% This file was created with JabRef 2.6.
% Encoding: UTF-8

@STRING{pldi = "Symp.\ on Programming Language Design and Implementation"}
@STRING{popl = "Symp.\ on Principles of Programming Languages"}

@InProceedings{shirako_12_cc,
  author = 	 { 
Jun Shirako and Kamal Sharma and Naznin Fauzia and Louis-No{\"e}l Pouchet and
J.~Ramanujam and P.~Sadayappan and Vivek Sarkar},
  title = 	 { Analytical Bounds for Optimal Tile Size Selection },
  booktitle = { ETAPS International Conference on Compiler Construction
  (CC'12)},
  address = { Tallinn, Estonia},
  month = 	 mar,
  year = 	 {2012},
  publisher =	 { Springer Verlag }
}

@INPROCEEDINGS{adl-tabatabai_compiler_2006,
  author = {{Adl-Tabatabai}, {Ali-Reza} and Lewis, Brian T. and Menon, Vijay
	and Murphy, Brian R. and Saha, Bratin and Shpeisman, Tatiana},
  title = {Compiler and runtime support for efficient software transactional
	memory},
  booktitle = {Proceedings of the 2006 {ACM} {SIGPLAN} conference on Programming
	language design and implementation},
  year = {2006},
  pages = {26--37},
  address = {Ottawa, Ontario, Canada},
  publisher = {{ACM}},
  abstract = {Programmers have traditionally used locks to synchronize concurrent
	access to shared data. Lock-based synchronization, however, has well-known
	pitfalls: using locks for fine-grain synchronization and composing
	code that already uses locks are both difficult and prone to deadlock.
	Transactional memory provides an alternate concurrency control mechanism
	that avoids these pitfalls and significantly eases concurrent programming.
	Transactional memory language constructs have recently been proposed
	as extensions to existing languages or included in new concurrent
	language specifications, opening the door for new compiler optimizations
	that target the overheads of transactional {memory.This} paper presents
	compiler and runtime optimizations for transactional memory language
	constructs. We present a high-performance software transactional
	memory system {(STM)} integrated into a managed runtime environment.
	Our system efficiently implements nested transactions that support
	both composition of transactions and partial roll back. Our {JIT}
	compiler is the first to optimize the overheads of {STM}, and we
	show novel techniques for enabling {JIT} optimizations on {STM} operations.
	We measure the performance of our optimizations on a 16-way {SMP}
	running multi-threaded transactional workloads. Our results show
	that these techniques enable transactional memory's performance to
	compete with that of well-tuned synchronization.},
  doi = {10.1145/1133981.1133985},
  isbn = {1-59593-320-4},
  keywords = {code generation, compiler optimizations, locking, read yes, synchronization,
	transactional memory, virtual machines},
  url = {http://portal.acm.org/citation.cfm?id=1133981.1133985}
}

@BOOK{alfred_compilers:_1986,
  title = {Compilers: principles, techniques, and tools},
  publisher = {Addison Wesley},
  year = {1986},
  author = {Alfred, V. A and Sethi, R. and Ullman, J. D},
  edition = {2},
  shorttitle = {Compilers}
}

@INPROCEEDINGS{ananian_unbounded_2005,
  author = {Ananian, C. Scott and Asanovic, Krste and Kuszmaul, Bradley C. and
	Leiserson, Charles E. and Lie, Sean},
  title = {Unbounded Transactional Memory},
  booktitle = {Proceedings of the 11th International Symposium on {High-Performance}
	Computer Architecture},
  year = {2005},
  pages = {316--327},
  publisher = {{IEEE} Computer Society},
  abstract = {Hardware transactional memory should support unbounded transactions:
	transactions of arbitrary size and duration. We describe a hardware
	implementation of unbounded transactional memory, called {UTM}, which
	exploits the common case for performance without sacrificing correctness
	on transactions whose footprint can be nearly as large as virtual
	memory. We performed a cycle-accurate simulation of a simplified
	architecture, called {LTM.} {LTM} is based on {UTM} but is easier
	to implement, because it does not change the memory subsystem outside
	of the processor. {LTM} allows nearly unbounded transactions, whose
	footprint is limited only by physical memory size and whose duration
	by the length of a timeslice. We assess {UTM} and {LTM} through microbenchmarking
	and by automatically converting the {SPECjvm98} Java benchmarks and
	the Linux 2.4.19 kernel to use transactions instead of locks. We
	use both cycle-accurate simulation and instrumentation to understand
	benchmark behavior. Our studies show that the common case is small
	transactions that commit, even when contention is high, but that
	some applications contain very large transactions. For example, although
	99.9\% of transactions in the Linux study touch 54 cache lines or
	fewer, some transactions touch over 8000 cache lines. Our studies
	also indicate that hardware support is required, because some applications
	spend over half their time in critical regions. Finally, they suggest
	that hardware support for transactions can make Java programs runfaster
	than when run using locks and can increase the concurrency of the
	Linux kernel by as much as a factor of 4 with no additional programming
	work.},
  isbn = {0-7695-2275-0},
  keywords = {read if time},
  url = {http://portal.acm.org/citation.cfm?id=1043430&dl=GUIDE&coll=GUIDE&CFID=47217513&CFTOKEN=59226035}
}

@ARTICLE{aslot_specomp:_2001,
  author = {Aslot, Vishal and Domeika, Max and Eigenmann, Rudolf and Gaertner,
	Greg and Jones, Wesley B and Parady, Bodo},
  title = {{SPEComp:} A New Benchmark Suite for Measuring Parallel Computer
	Performance},
  journal = {{IN} {WORKSHOP} {ON} {OPENMP} {APPLICATIONS} {AND} {TOOLS}},
  year = {2001},
  volume = {2104},
  pages = {1---10},
  shorttitle = {{SPEComp}},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.16.5625}
}

@ARTICLE{aspnes_counting_1994,
  author = {Aspnes, James and Herlihy, Maurice and Shavit, Nir},
  title = {Counting networks},
  journal = {J. {ACM}},
  year = {1994},
  volume = {41},
  pages = {1020--1048},
  number = {5},
  abstract = {Many fundamental multi-processor coordination problems can be expressed
	as counting problems: Processes must cooperate to assign successive
	values from a given range, such as addresses in memory or destinations
	on an interconnection network. Conventional solutions to these problems
	perform poorly because of synchronization bottlenecks and high memory
	{contention.Motivated} by observations on the behavior of sorting
	networks, we offer a new approach to solving such problems, by introducing
	counting networks, a new class of networks that can be used to count.
	We give two counting network constructions, one of depth log n(1
	\&plus; log n)/2 using n log (1 \&plus; log n)/4 {\textquotedblleft}gates,{\textquotedblright}
	and a second of depth log2 n using n log2 n/2 gates. These networks
	avoid the sequential bottlenecks inherent to earlier solutions and
	substantially lower the memory {contention.Finally}, to show that
	counting networks are not merely mathematical creatures, we provide
	experimental evidence that they outperform conventional synchronization
	techniques under a variety of circumstances.},
  doi = {10.1145/185675.185815},
  keywords = {counting networks, hot-sports, network routing, parallel processing},
  url = {http://portal.acm.org/citation.cfm?id=185815&dl=}
}

@TECHREPORT{u._banerjee_data_1976,
  author = {U. Banerjee},
  title = {Data dependence in ordinary programs},
  institution = {Dept. of Computer Science, University of Illinois at {Urbana-Champaign}},
  year = {1976},
  type = {Th\'{e}se de master},
  month = nov
}

@ARTICLE{banerjee_automatic_1993,
  author = {Banerjee, Utpal and Eigenmann, Rudolf and Nicolau, Alexandru and
	Padua, David A},
  title = {Automatic Program Parallelization},
  year = {1993},
  keywords = {read yes},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.50.6984}
}

@INPROCEEDINGS{benabderrahmane_polyhedral_2010,
  author = {Benabderrahmane, {M.-W.} and Pouchet, {L.-N.} and Cohen, Albert and
	Bastoul, Cedric},
  title = {The Polyhedral Model Is More Widely Applicable Than You Think},
  booktitle = {Proceedings of the International Conference on Compiler Construction
	{(ETAPS} {CC'10)}},
  year = {2010},
  series = {{LNCS}},
  address = {Paphos, Cyprus},
  month = mar,
  publisher = {{Springer-Verlag}}
}

@MISC{blume_polaris:_1994,
  author = {Blume, Bill and Eigenmann, Rudolf and Faigin, Keith and Grout, John
	and Hoe, Jay and Padua, David and Petersen, Paul and Pottenger, Bill
	and Rauchwerger, Lawrence and Tu, Peng and Weatherford, Stephen},
  title = {Polaris: The Next Generation in Parallelizing Compilers},
  howpublished = {http://eprints.kfupm.edu.sa/58245/},
  year = {1994},
  note = {It is the goal of the Polaris project to develop a new parallelizing
	compiler that will overcome limitations of current compilers. While
	current parallelizing compilers may succeed on small kernels, they
	often fail to extract any meaningful parallelism from large applications.
	After a study of application codes, it was concluded that by adding
	a few new techniques to current compilers, automatic parallelization
	becomes possible. The techniques needed are interprocedural analysis,
	scalar and array privatization, symbolic dependence analysis, and
	advanced induction and reduction recognition and elimination, along
	with run-time techniques to allow data dependent behavior. 1},
  keywords = {General},
  shorttitle = {Polaris},
  type = {Article},
  url = {http://eprints.kfupm.edu.sa/58245/}
}

@INPROCEEDINGS{bondhugula_practical_2008,
  author = {Bondhugula, Uday and Hartono, Albert and Ramanujam, J. and Sadayappan,
	P.},
  title = {A practical automatic polyhedral parallelizer and locality optimizer},
  booktitle = {Proceedings of the 2008 {ACM} {SIGPLAN} conference on Programming
	language design and implementation},
  year = {2008},
  pages = {101--113},
  address = {Tucson, {AZ}, {USA}},
  publisher = {{ACM}},
  abstract = {We present the design and implementation of an automatic polyhedral
	source-to-source transformation framework that can optimize regular
	programs (sequences of possibly imperfectly nested loops) for parallelism
	and locality simultaneously. Through this work, we show the practicality
	of analytical model-driven automatic transformation in the polyhedral
	model -- far beyond what is possible by current production compilers.
	Unlike previous works, our approach is an end-to-end fully automatic
	one driven by an integer linear optimization framework that takes
	an explicit view of finding good ways of tiling for parallelism and
	locality using affine transformations. The framework has been implemented
	into a tool to automatically generate {OpenMP} parallel code from
	C program sections. Experimental results from the tool show very
	high speedups for local and parallel execution on multi-cores over
	state-of-the-art compiler frameworks from the research community
	as well as the best native production compilers. The system also
	enables the easy use of powerful empirical/iterative optimization
	for general arbitrarily nested loop sequences.},
  doi = {10.1145/1375581.1375595},
  isbn = {978-1-59593-860-2},
  keywords = {affine transformations, automatic parallelization, locality optimization,
	loop transformations, polyhedral model, tiling},
  url = {http://portal.acm.org/citation.cfm?id=1375581.1375595}
}

@InProceedings{Iri88,
  author = 	 {F. Irigoin and R. Triolet},
  title = 	 {Supernode Partitioning},
  booktitle = 	 popl # { (POPL'88)},
  pages =	 {319--328},
  year =	 1988,
  address =	 {San Diego, CA},
  month =	 jan
}

@PHDTHESIS{bondhugula_effective_2010,
  author = {Bondhugula, U. {K.R}},
  title = {Effective automatic parallelization and locality optimization using
	the polyhedral model},
  school = {Citeseer},
  year = {2010}
}

@ARTICLE{bridges_revisiting_2008,
  author = {Bridges, Matthew J. and Vachharajani, Neil and Zhang, Yun and Jablin,
	Thomas and August, David I.},
  title = {Revisiting the Sequential Programming Model for the Multicore Era},
  journal = {{IEEE} Micro},
  year = {2008},
  volume = {28},
  pages = {12--20},
  number = {1},
  abstract = {Automatic parallelization has thus far not been successful at extracting
	scalable parallelism from general programs. An aggressive automatic
	thread extraction framework, coupled with natural extensions to the
	sequential programming model that allow for a range of legal outcomes
	rather than forcing programmers to define a single legal program
	outcome, will let programmers achieve the performance of parallel
	programming via the simpler sequential model.},
  keywords = {automatic parallelization, compiler-architecture interactions, compilers,
	sequential-programming model, thread extraction},
  url = {http://portal.acm.org/citation.cfm?id=1345875}
}

@INPROCEEDINGS{burke_interprocedural_1986,
  author = {Burke, Michael and Cytron, Ron},
  title = {Interprocedural dependence analysis and parallelization},
  booktitle = {Proceedings of the 1986 {SIGPLAN} symposium on Compiler construction},
  year = {1986},
  pages = {162--175},
  address = {Palo Alto, California, United States},
  publisher = {{ACM}},
  abstract = {We present a method that combines a deep analysis of program dependences
	with a broad analysis of the interaction among procedures. The method
	is more efficient than existing methods: we reduce many tests, performed
	separately by existing methods, to a single test. The method is more
	precise than existing methods with respect to references to multi-dimensional
	arrays and dependence information hidden by procedure calls. The
	method is more general than existing methods: we accommodate potentially
	aliased variables and structures of differing shapes that share storage.
	We accomplish the above through a unified approach that integrates
	subscript analysis with aliasing and interprocedural information.},
  doi = {10.1145/12276.13328},
  isbn = {0-89791-197-0},
  keywords = {read yes},
  url = {http://portal.acm.org/citation.cfm?id=12276.13328&type=series}
}

@PHDTHESIS{cedric_improving_2004,
  author = {C\'{e}dric, Bastoul},
  title = {Improving Data Locality in Static Control Programs},
  school = {University Paris 6, Pierre et Marie Curie, France},
  year = {2004},
  address = {Paris, France},
  month = dec
}

@ARTICLE{cascaval_software_2008,
  author = {Cascaval, Calin and Blundell, Colin and Michael, Maged and Cain,
	Harold W. and Wu, Peng and Chiras, Stefanie and Chatterjee, Siddhartha},
  title = {Software transactional memory: why is it only a research toy?},
  journal = {Commun. {ACM}},
  year = {2008},
  volume = {51},
  pages = {40--46},
  number = {11},
  abstract = {The promise of {STM} may likely be undermined by its overheads and
	workload applicabilities.},
  doi = {10.1145/1400214.1400228},
  keywords = {ok, P},
  shorttitle = {Software transactional memory},
  url = {http://portal.acm.org/ft_gateway.cfm?id=1400228&type=html&coll=GUIDE&dl=GUIDE&CFID=47424669&CFTOKEN=69787166}
}

@ARTICLE{cederman_gpu-quicksort:_2009,
  author = {Cederman, Daniel and Tsigas, Philippas},
  title = {{GPU-Quicksort:} A practical Quicksort algorithm for graphics processors},
  journal = {J. Exp. Algorithmics},
  year = {2009},
  volume = {14},
  pages = {1.4--1.24},
  abstract = {In this article, we describe {GPU-Quicksort}, an efficient Quicksort
	algorithm suitable for highly parallel multicore graphics processors.
	Quicksort has previously been considered an inefficient sorting solution
	for graphics processors, but we show that in {CUDA}, {NVIDIA's} programing
	platform for general-purpose computations on graphical processors,
	{GPU-Quicksort} performs better than the fastest-known sorting implementations
	for graphics processors, such as radix and bitonic sort. Quicksort
	can thus be seen as a viable alternative for sorting large quantities
	of data on graphics processors.},
  doi = {10.1145/1498698.1564500},
  keywords = {cuda, gpgpu, multicore, quicksort, sorting},
  shorttitle = {{GPU-Quicksort}},
  url = {http://portal.acm.org/citation.cfm?id=1498698.1564500&coll=GUIDE&dl=GUIDE&CFID=105156593&CFTOKEN=37140497}
}

@BOOK{chapman_using_2007,
  title = {Using {OpenMP:} Portable Shared Memory Parallel Programming {(Scientific}
	and Engineering Computation)},
  publisher = {The {MIT} Press},
  year = {2007},
  author = {Chapman, Barbara and Jost, Gabriele and Pas, Ruud van der},
  abstract = {{"I} hope that readers will learn to use the full expressibility and
	power of {OpenMP.} This book should provide an excellent introduction
	to beginners, and the performance section should help those with
	some experience who want to push {OpenMP} to its limits." --from
	the foreword by David J. Kuck, Intel Fellow, Software and Solutions
	Group, and Director, Parallel and Distributed Solutions, Intel Corporation
	{OpenMP}, a portable programming interface for shared memory parallel
	computers, was adopted as an informal standard in 1997 by computer
	scientists who wanted a unified model on which to base programs for
	shared memory systems. {OpenMP} is now used by many software developers;
	it offers significant advantages over both hand-threading and {MPI.}
	Using {OpenMP} offers a comprehensive introduction to parallel programming
	concepts and a detailed overview of {OpenMP.} Using {OpenMP} discusses
	hardware developments, describes where {OpenMP} is applicable, and
	compares {OpenMP} to other programming interfaces for shared and
	distributed memory parallel architectures. It introduces the individual
	features of {OpenMP}, provides many source code examples that demonstrate
	the use and functionality of the language constructs, and offers
	tips on writing an efficient {OpenMP} program. It describes how to
	use {OpenMP} in full-scale applications to achieve high performance
	on large-scale architectures, discussing several case studies in
	detail, and offers in-depth troubleshooting advice. It explains how
	{OpenMP} is translated into explicitly multithreaded code, providing
	a valuable behind-the-scenes account of {OpenMP} program performance.
	Finally, Using {OpenMP} considers trends likely to influence {OpenMP}
	development, offering a glimpse of the possibilities of a future
	{OpenMP} 3.0 from the vantage point of the current {OpenMP} 2.5.
	With multicore computer use increasing, the need for a comprehensive
	introduction and overview of the standard interface is clear. Using
	{OpenMP} provides an essential reference not only for students at
	both undergraduate and graduate levels but also for professionals
	who intend to parallelize existing codes or develop new parallel
	programs for shared memory computer architectures.},
  isbn = {0262533022, 9780262533027},
  shorttitle = {Using {OpenMP}},
  url = {http://portal.acm.org/citation.cfm?id=1370966#}
}

@ARTICLE{che_performance_2008,
  author = {Che, Shuai and Boyer, Michael and Meng, Jiayuan and Tarjan, David
	and Sheaffer, Jeremy W. and Skadron, Kevin},
  title = {A performance study of general-purpose applications on graphics processors
	using {CUDA}},
  journal = {J. Parallel Distrib. Comput.},
  year = {2008},
  volume = {68},
  pages = {1370--1380},
  number = {10},
  abstract = {Graphics processors {(GPUs)} provide a vast number of simple, data-parallel,
	deeply multithreaded cores and high memory bandwidths. {GPU} architectures
	are becoming increasingly programmable, offering the potential for
	dramatic speedups for a variety of general-purpose applications compared
	to contemporary general-purpose processors {(CPUs).} This paper uses
	{NVIDIA's} C-like {CUDA} language and an engineering sample of their
	recently introduced {GTX} 260 {GPU} to explore the effectiveness
	of {GPUs} for a variety of application types, and describes some
	specific coding idioms that improve their performance on the {GPU.}
	{GPU} performance is compared to both single-core and multicore {CPU}
	performance, with multicore {CPU} implementations written using {OpenMP.}
	The paper also discusses advantages and inefficiencies of the {CUDA}
	programming model and some desirable features that might allow for
	greater ease of use and also more readily support a larger body of
	applications.},
  keywords = {cuda, gpgpu, gpu, graphics processors, heterogeneous computing organizations,
	manycore, multicore, openmp, parallel programming},
  url = {http://portal.acm.org/citation.cfm?id=1412749.1412827&coll=GUIDE&dl=GUIDE&CFID=105156593&CFTOKEN=37140497}
}

@ARTICLE{cho_towards_????,
  author = {Cho, C. Y and Fowler, S. and Sagdeo, P.},
  title = {Towards Automatic Parallelization on Real Code}
}

@INCOLLECTION{cohen_parallelization_1999,
  author = {Cohen, Albert},
  title = {Parallelization via constrained storage mapping optimization},
  booktitle = {Intl. Symp. on High Performance Computing},
  publisher = {Springer-Verlag},
  year = {1999},
  editor = {Fukuda, Kazuki Joe Akira and Tomita, Shinji and Polychronopoulos,
	Constantine},
  volume = {1615},
  series = {LNCS},
  pages = {83--94},
}

@ARTICLE{cohen_optimization_1998,
  author = {Cohen, Albert and Lefebvre, Vincent},
  title = {Optimization of Storage Mappings for Parallel Programs},
  journal = {in Europar 99, number 1685 in {LNCS}},
  year = {1998},
  pages = {375---382},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.16.3134}
}

@INPROCEEDINGS{w._e._cohen_multiple_2003,
  author = {W. E. Cohen},
  title = {Multiple architecture characterization of the linux build process
	with {OProfile}},
  year = {2003}
}

@INPROCEEDINGS{davis_rose_1999,
  author = {Davis, K. and Quinlan, D.},
  title = {{ROSE} {II:} An optimizing code transformer for C++ object-oriented
	array class libraries},
  booktitle = {Third World Multiconference on Systemics, Cybernetics and Informatics
	{(SCI{\textquoteright}99)} and the Fifth International Conference
	on Information Systems Analysis and Synthesis {(ISAS{\textquoteright}99)}},
  year = {1999},
  shorttitle = {{ROSE} {II}}
}

@INPROCEEDINGS{dice_early_2009,
  author = {Dice, Dave and Lev, Yossi and Moir, Mark and Nussbaum, Daniel},
  title = {Early experience with a commercial hardware transactional memory
	implementation},
  booktitle = {Proceeding of the 14th international conference on Architectural
	support for programming languages and operating systems},
  year = {2009},
  pages = {157--168},
  address = {Washington, {DC}, {USA}},
  publisher = {{ACM}},
  abstract = {We report on our experience with the hardware transactional memory
	{(HTM)} feature of two pre-production revisions of a new commercial
	multicore processor. Our experience includes a number of promising
	results using {HTM} to improve performance in a variety of contexts,
	and also identifies some ways in which the feature could be improved
	to make it even better. We give detailed accounts of our experiences,
	sharing techniques we used to achieve the results we have, as well
	as describing challenges we faced in doing so.},
  doi = {10.1145/1508244.1508263},
  isbn = {978-1-60558-406-5},
  keywords = {hardware, synchronization, transactional memory},
  url = {http://portal.acm.org/citation.cfm?id=1508244.1508263}
}

@INCOLLECTION{dice_transactional_2006,
  author = {Dice, Dave and Shalev, Ori and Shavit, Nir},
  title = {Transactional Locking {II}},
  booktitle = {Distributed Computing},
  year = {2006},
  pages = {194--208},
  abstract = {The transactional memory programming paradigm is gaining momentum
	as the approach of choice for replacing locks in concurrent programming.
	This paper introduces the transactional locking {II} {(TL2)} algorithm,
	a software transactional memory {(STM)} algorithm based on a combination
	of commit-time locking and a novel global version-clock based validation
	technique. {TL2} improves on state-of-the-art {STMs} in the following
	ways: (1) unlike all other {STMs} it fits seamlessly with any system{\textquoteright}s
	memory life-cycle, including those using malloc/free (2) unlike all
	other lock-based {STMs} it efficiently avoids periods of unsafe execution,
	that is, using its novel version-clock validation, user code is guaranteed
	to operate only on consistent memory states, and (3) in a sequence
	of high performance benchmarks, while providing these new properties,
	it delivered overall performance comparable to (and in many cases
	better than) that of all former {STM} algorithms, both lock-based
	and non-blocking. Perhaps more importantly, on various benchmarks,
	{TL2} delivers performance that is competitive with the best hand-crafted
	fine-grained concurrent structures. Specifically, it is ten-fold
	faster than a single lock. We believe these characteristics make
	{TL2} a viable candidate for deployment of transactional memory today,
	long before hardware transactional support is available.},
  keywords = {ok},
  url = {http://dx.doi.org/10.1007/11864219_14}
}

@INPROCEEDINGS{dice_what_2006,
  author = {Dice, D. and Shavit, N.},
  title = {What really makes transactions faster},
  booktitle = {{TRANSACT} Workshop},
  year = {2006},
  keywords = {P, read yes}
}

@ARTICLE{dictionary_oxford:_2004,
  author = {Dictionary, O. E},
  title = {Oxford: Oxford University Press},
  journal = {Retrieved May},
  year = {2004},
  volume = {24},
  pages = {2004},
  shorttitle = {Oxford}
}

@ARTICLE{duff_sparse_1982,
  author = {Duff, Iain and Grimes, Roger and Lewis, John and Poole, Bill},
  title = {Sparse matrix test problems},
  journal = {{SIGNUM} Newsl.},
  year = {1982},
  volume = {17},
  pages = {22--22},
  number = {2},
  abstract = {The development, analysis and production of algorithms in sparse linear
	algebra often requires the use of test problems to demonstrate the
	effectiveness and applicability of the algorithms. Many algorithms
	have been developed in the context of specific application areas
	and have been tested in the context of sets of test problems collected
	by the developers. Comparisons of algorithms across application areas
	and comparisons between algorithms has often been incomplete, due
	to the lack of a comprehensive set of test problems. Additionally
	we believe that a comprehensive set of test problems will lead to
	a better understanding of the range of structures in sparse matrix
	problems and thence to better classification and development of algorithms.
	We have agreed to sponsor and maintain a general library of sparse
	matrix test problems, available on request to anyone for a nominal
	fee to cover postal charges. Contributors to the library will, of
	course, receive a free copy.},
  doi = {10.1145/1057588.1057590},
  url = {http://portal.acm.org/citation.cfm?id=1057588.1057590}
}

@INCOLLECTION{eigenmann_experience_1992,
  author = {Eigenmann, R. and Hoeflinger, J. and Li, Z. and Padua, D.},
  title = {Experience in the automatic parallelization of four {Perfect-Benchmark}
	programs},
  booktitle = {Languages and Compilers for Parallel Computing},
  year = {1992},
  pages = {65--83},
  abstract = {This paper discusses the techniques used to hand-parallelize, for
	the Alliant {FX/80}, four Fortran programs from the {Perfect-Benchmark}
	suite. The paper also includes the execution times of the programs
	before and after the transformations. The four programs considered
	here were not effectively parallelized by the automatic translators
	available to the authors. However, most of the techniques used for
	hand parallelization, and perhaps all of them, have wide applicability
	and can be incorporated into existing translators.},
  url = {http://dx.doi.org/10.1007/BFb0038658}
}

@ARTICLE{eigenmann_automatic_1998,
  author = {Eigenmann, Rudolf and Hoeflinger, Jay and Padua, David},
  title = {On the Automatic Parallelization of the Perfect Benchmarks{\textbackslash}\&{\textbackslash}\#174},
  journal = {{IEEE} Trans. Parallel Distrib. Syst.},
  year = {1998},
  volume = {9},
  pages = {5--23},
  number = {1},
  abstract = {This paper presents the results of the Cedar {Hand-Parallelization}
	Experiment, conducted from 1989 through 1992, within the Center for
	Supercomputing Research and Development {(CSRD)} at the University
	of Illinois. In this experiment, we manually transformed the Perfect
	Benchmarks{\textasciimacron} into parallel program versions. In doing
	so, we used techniques that may be automated in an optimizing compiler.
	We then ran these programs on the Cedar multiprocessor (built at
	{CSRD} during the 1980s) and measured the speed improvement due to
	each {technique.The} results presented here extend the findings previously
	reported in [11]. The techniques credited most for the performance
	gains include array privatization, parallelization of reduction operations,
	and the substitution of generalized induction variables. All these
	techniques can be considered extensions of transformations that were
	available in vectorizers and commercial restructuring compilers of
	the late 1980s. We applied these transformations by hand to the given
	programs, in a mechanical manner, similar to that of a parallelizing
	compiler. Because of our success with these transformations, we believed
	that it would be possible to implement many of these techniques in
	a new parallelizing compiler. Such a compiler has been completed
	in the meantime and we show preliminary results.},
  keywords = {parallelization techniques, performance evaluation., program parallelization,
	restructuring compilers},
  url = {http://portal.acm.org/citation.cfm?id=272875&dl=GUIDE&coll=GUIDE&CFID=69866626&CFTOKEN=27616381}
}

@ARTICLE{feautrier_dataflow_1991,
  author = {Feautrier, Paul},
  title = {Dataflow analysis of array and scalar references},
  journal = {International Journal of Parallel Programming},
  year = {1991},
  volume = {20},
  pages = {23--53},
  number = {1},
  month = feb,
  doi = {10.1007/BF01407931},
  issn = {0885-7458},
  url = {http://www.springerlink.com/content/k6253346u547n15v/}
}

@INPROCEEDINGS{feautrier_array_1988,
  author = {Feautrier, P.},
  title = {Array expansion},
  booktitle = {Proceedings of the 2nd international conference on Supercomputing},
  year = {1988},
  pages = {429--441},
  address = {St. Malo, France},
  publisher = {{ACM}},
  abstract = {A common problem in restructuring programs for vector or parallel
	execution is the suppression of false dependencies which originate
	in the reuse of the same memory cell for unrelated values. The method
	is simple and well understood in the case of scalars. This paper
	gives the general solution for the case of arrays. The expansion
	is done in two steps: first, modify all definitions of the offending
	array in order to obtain the single assignment property. Then, reconstruct
	the original data flow by adapting all uses of the array. This is
	done with the help of a new algorithm for solving parametric integer
	programs. The technique is quite general and may be used for other
	purposes, including program checking, collecting array predicates,
	etc{\textellipsis}},
  doi = {10.1145/55364.55406},
  isbn = {0-89791-272-1},
  url = {http://portal.acm.org/citation.cfm?id=55406}
}

@ARTICLE{feitelson_supercomputer_2005,
  author = {Feitelson, Dror G.},
  title = {The Supercomputer Industry in Light of the Top500 Data},
  journal = {Computing in Science and Engg.},
  year = {2005},
  volume = {7},
  pages = {42--47},
  number = {1},
  abstract = {The Top500 list, which has been updated semiannually for the past
	decade, ranks the 500 most powerful computers installed worldwide.
	Analyzing this data gives an impartial look at the supercomputer
	industry's current state and development trends, and sheds light
	on the challenges the industry faces.},
  keywords = {semiconductor, supercomputing, top 500},
  url = {http://portal.acm.org/citation.cfm?id=1042257}
}

@ARTICLE{fraser_concurrent_2007,
  author = {Fraser, Keir and Harris, Tim},
  title = {Concurrent programming without locks},
  journal = {{ACM} Trans. Comput. Syst.},
  year = {2007},
  volume = {25},
  pages = {5},
  number = {2},
  abstract = {Mutual exclusion locks remain the de facto mechanism for concurrency
	control on shared-memory data structures. However, their apparent
	simplicity is deceptive: It is hard to design scalable locking strategies
	because locks can harbor problems such as priority inversion, deadlock,
	and convoying. Furthermore, scalable lock-based systems are not readily
	composable when building compound operations. In looking for solutions
	to these problems, interest has developed in nonblocking systems
	which have promised scalability and robustness by eschewing mutual
	exclusion while still ensuring safety. However, existing techniques
	for building nonblocking systems are rarely suitable for practical
	use, imposing substantial storage overheads, serializing nonconflicting
	operations, or requiring instructions not readily available on today's
	{CPUs.}},
  doi = {10.1145/1233307.1233309},
  keywords = {concurrency, lock-free systems, read if time, transactional memory},
  url = {http://portal.acm.org/citation.cfm?id=1233309}
}

@INPROCEEDINGS{gotsman_proving_2009,
  author = {Gotsman, A. and Cook, B. and Parkinson, M. and Vafeiadis, V.},
  title = {Proving that non-blocking algorithms don't block},
  booktitle = {Proceedings of the 36th annual {ACM} {SIGPLAN-SIGACT} symposium on
	Principles of programming languages},
  year = {2009},
  pages = {16{\textendash}28},
  keywords = {read yes}
}

@ARTICLE{graham_gprof:_1982,
  author = {Graham, Susan L. and Kessler, Peter B. and Mckusick, Marshall K.},
  title = {Gprof: A call graph execution profiler},
  journal = {{SIGPLAN} Not.},
  year = {1982},
  volume = {17},
  pages = {120--126},
  number = {6},
  abstract = {Large complex programs are composed of many small routines that implement
	abstractions for the routines that call them. To be useful, an execution
	profiler must attribute execution time in a way that is significant
	for the logical structure of a program as well as for its textual
	decomposition. This data must then be displayed to the user in a
	convenient and informative way. The gprof profiler accounts for the
	running time of called routines in the running time of the routines
	that call them. The design and use of this profiler is described.},
  doi = {10.1145/872726.806987},
  shorttitle = {Gprof},
  url = {http://portal.acm.org/citation.cfm?id=872726.806987}
}

@MISC{Gri04b,
  author = {Griebl, M.},
  title = {Automatic parallelization of loop programs for distributed memory
	architectures. {H}abilitation thesis. {F}acult{\"a}t f{\"u}r {M}athematik
	und {I}nformatik, {U}niversit{\"a}t {P}assau},
  year = {2004},
  citeulike-article-id = {1432913},
  keywords = {bibtex-import},
  posted-at = {2007-07-04 13:13:25},
  priority = {2},
  school = {Facult{\"a}t f{\"u}r Mathematik und Informatik, Universit{\"a}t Passau}
}

@ARTICLE{gupta_privatization_1997,
  author = {Gupta, Manish},
  title = {On Privatization of Variables for {Data-Parallel} Execution},
  journal = {{IN} {PROCEEDINGS} {OF} {THE} {11TH} {INTERNATIONAL} {PARALLEL} {PROCESSING}
	{SYMPOSIUM}},
  year = {1997},
  pages = {533---541},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.50.2508}
}

@INPROCEEDINGS{hamada_42_2009,
  author = {Hamada, Tsuyoshi and Narumi, Tetsu and Yokota, Rio and Yasuoka, Kenji
	and Nitadori, Keigo and Taiji, Makoto},
  title = {42 {TFlops} hierarchical {{\textless}i{\textgreater}N{\textless}/i{\textgreater}-body}
	simulations on {GPUs} with applications in both astrophysics and
	turbulence},
  booktitle = {Proceedings of the Conference on High Performance Computing Networking,
	Storage and Analysis},
  year = {2009},
  pages = {1--12},
  address = {Portland, Oregon},
  publisher = {{ACM}},
  abstract = {As an entry for the 2009 Gordon Bell price/performance prize, we present
	the results of two different hierarchical N-body simulations on a
	cluster of 256 graphics processing units {(GPUs).} Unlike many previous
	N-body simulations on {GPUs} that scale as {O(N2)}, the present method
	calculates the {O(N} log N) treecode and {O(N)} fast multipole method
	{(FMM)} on the {GPUs} with unprecedented efficiency. We demonstrate
	the performance of our method by choosing one standard application
	--a gravitational N-body simulation-- and one non-standard application
	--simulation of turbulence using vortex particles. The gravitational
	simulation using the treecode with 1,608,044,129 particles showed
	a sustained performance of 42.15 {TFlops.} The vortex particle simulation
	of homogeneous isotropic turbulence using the periodic {FMM} with
	16,777,216 particles showed a sustained performance of 20.2 {TFlops.}
	The overall cost of the hardware was 228,912 dollars. The maximum
	corrected performance is {28.1TFlops} for the gravitational simulation,
	which results in a cost performance of 124 {MFlops/\$.} This correction
	is performed by counting the Flops based on the most efficient {CPU}
	algorithm. Any extra Flops that arise from the {GPU} implementation
	and parameter differences are not included in the 124 {MFlops/\$.}},
  doi = {10.1145/1654059.1654123},
  isbn = {978-1-60558-744-8},
  url = {http://portal.acm.org/citation.cfm?id=1654059.1654123&coll=GUIDE&dl=GUIDE&CFID=105156593&CFTOKEN=37140497}
}

@ARTICLE{harris_language_2003,
  author = {Harris, Tim and Fraser, Keir},
  title = {Language support for lightweight transactions},
  journal = {{SIGPLAN} Not.},
  year = {2003},
  volume = {38},
  pages = {388--402},
  number = {11},
  abstract = {Concurrent programming is notoriously difficult. Current abstractions
	are intricate and make it hard to design computer systems that are
	reliable and scalable. We argue that these problems can be addressed
	by moving to a declarative style of concurrency control in which
	programmers directly indicate the safety properties that they require.
	In our scheme the programmer demarks sections of code which execute
	within lightweight software-based transactions that commit atomically
	and exactly once. These transactions can update shared data, instantiate
	objects, invoke library features and so on. They can also block,
	waiting for arbitrary boolean conditions to become true. Transactions
	which do not access the same shared memory locations can commit concurrently.
	Furthermore, in general, no performance penalty is incurred for memory
	accesses outside {transactions.We} present a detailed design of this
	proposal along with an implementation and evaluation. We argue that
	the resulting system (i) is easier for mainstream programmers to
	use, (ii) prevents lock-based priority-inversion and deadlock problems
	and (iii) can offer performance advantages.},
  doi = {10.1145/949343.949340},
  keywords = {concurrency, conditional critical regions, non-blocking systems, read
	if time, transactions},
  url = {http://portal.acm.org/citation.cfm?id=949340}
}

@INPROCEEDINGS{harris_composable_2005,
  author = {Harris, Tim and Marlow, Simon and {Peyton-Jones}, Simon and Herlihy,
	Maurice},
  title = {Composable memory transactions},
  booktitle = {Proceedings of the tenth {ACM} {SIGPLAN} symposium on Principles
	and practice of parallel programming},
  year = {2005},
  pages = {48--60},
  address = {Chicago, {IL}, {USA}},
  publisher = {{ACM}},
  abstract = {Writing concurrent programs is notoriously difficult, and is of increasing
	practical importance. A particular source of concern is that even
	correctly-implemented concurrency abstractions cannot be composed
	together to form larger abstractions. In this paper we present a
	new concurrency model, based on transactional memory, that offers
	far richer composition. All the usual benefits of transactional memory
	are present (e.g. freedom from deadlock), but in addition we describe
	new modular forms of blocking and choice that have been inaccessible
	in earlier work.},
  doi = {10.1145/1065944.1065952},
  isbn = {1-59593-080-9},
  keywords = {locks, non-blocking algorithms, read if time, transactions},
  url = {http://portal.acm.org/citation.cfm?id=1065952}
}

@ARTICLE{harris_optimizing_2006,
  author = {Harris, Tim and Plesko, Mark and Shinnar, Avraham and Tarditi, David},
  title = {Optimizing memory transactions},
  journal = {{SIGPLAN} Not.},
  year = {2006},
  volume = {41},
  pages = {14--25},
  number = {6},
  abstract = {Atomic blocks allow programmers to delimit sections of code as 'atomic',
	leaving the language's implementation to enforce atomicity. Existing
	work has shown how to implement atomic blocks over word-based transactional
	memory that provides scalable multi-processor performance without
	requiring changes to the basic structure of objects in the heap.
	However, these implementations perform poorly because they interpose
	on all accesses to shared memory in the atomic block, redirecting
	updates to a thread-private log which must be searched by reads in
	the block and later reconciled with the heap when leaving the {block.This}
	paper takes a four-pronged approach to improving performance: (1)
	we introduce a new 'direct access' implementation that avoids searching
	thread-private logs, (2) we develop compiler optimizations to reduce
	the amount of logging (e.g. when a thread accesses the same data
	repeatedly in an atomic block), (3) we use runtime filtering to detect
	duplicate log entries that are missed statically, and (4) we present
	a series of {GC-time} techniques to compact the logs generated by
	long-running atomic {blocks.Our} implementation supports short-running
	scalable concurrent benchmarks with less than 50{\textbackslash}\%
	overhead over a non-thread-safe baseline. We support long atomic
	blocks containing millions of shared memory accesses with a 2.5-4.5x
	slowdown.},
  doi = {10.1145/1133255.1133984},
  keywords = {atomicity, critical regions, read yes, transactional memory},
  url = {http://portal.acm.org/citation.cfm?id=1133984}
}

@ARTICLE{hassan_common_2006,
  author = {Hassan, Jaewoong Chung and Chafi, Hassan and Minh, Chi Cao and Mcdonald,
	Austen and Carlstrom, Brian D and Kozyrakis, Christos and Olukotun,
	Kunle},
  title = {The Common Case Transactional Behavior of Multithreaded Programs},
  journal = {{IN} {PROCEEDINGS} {OF} {THE} {12TH} {INTERNATIONAL} {CONFERENCE}
	{ON} {HIGH-PERFORMANCE} {COMPUTER} {ARCHITECTURE}},
  year = {2006},
  keywords = {P, read if time},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.60.6963}
}

@ARTICLE{he_preemption_2005,
  author = {He, Bijun and {III}, William N. Scherer and Scott, Michael L},
  title = {Preemption Adaptivity in {Time-Published} {Queue-Based} Spin Locks},
  journal = {{IN} {PROCEEDINGS} {OF} {THE} {11TH} {INTERNATIONAL} {CONFERENCE}
	{ON} {HIGH} {PERFORMANCE} {COMPUTING}},
  year = {2005},
  keywords = {read no},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.8072}
}

@INPROCEEDINGS{hendren_designing_1993,
  author = {Hendren, Laurie J. and Donawa, C. and Emami, Maryam and Gao, Guang
	R. and Justiani and Sridharan, B.},
  title = {Designing the {McCAT} Compiler Based on a Family of Structured Intermediate
	Representations},
  booktitle = {Proceedings of the 5th International Workshop on Languages and Compilers
	for Parallel Computing},
  year = {1993},
  pages = {406--420},
  publisher = {{Springer-Verlag}},
  isbn = {3-540-57502-2},
  url = {http://portal.acm.org/citation.cfm?id=665360}
}

@ARTICLE{henning_spec_2000,
  author = {Henning, John L.},
  title = {{SPEC} {CPU2000:} Measuring {CPU} Performance in the New Millennium},
  journal = {Computer},
  year = {2000},
  volume = {33},
  pages = {28--35},
  number = {7},
  abstract = {As computers and software have become more powerful, it seems almost
	human nature to want the biggest and fastest toy you can afford.
	But how do you know if your toy is tops? Even if your application
	never does any {I/O}, it's not just the speed of the {CPU} that dictates
	performance. Cache, main memory, and compilers also play a role.
	Software applications also have differing performance requirements.
	So whom do you trust to provide this information? The Standard Performance
	Evaluation Corporation {(SPEC)} is a nonprofit consortium whose members
	include hardware vendors, software vendors, universities, customers,
	and consultants. {SPEC's} mission is to develop technically credible
	and objective component- and system-level benchmarks for multiple
	operating systems and environments, including high-performance numeric
	computing, Web servers, and graphical subsystems. On 30 June 2000,
	{SPEC} retired the {CPU95} benchmark suite. Its replacement is {CPU2000},
	a new {CPU} benchmark suite with 19 applications that have never
	before been in a {SPEC} {CPU} suite. This article discusses how {SPEC}
	developed this benchmark suite and what the benchmarks do},
  issn = {0018-9162},
  shorttitle = {{SPEC} {CPU2000}}
}

@INPROCEEDINGS{herlihy_software_2003,
  author = {Herlihy, Maurice and Luchangco, Victor and Moir, Mark and William
	N. Scherer, I. I. I.},
  title = {Software transactional memory for dynamic-sized data structures},
  booktitle = {Proceedings of the twenty-second annual symposium on Principles of
	distributed computing},
  year = {2003},
  pages = {92--101},
  address = {Boston, Massachusetts},
  publisher = {{ACM}},
  abstract = {We propose a new form of software transactional memory {(STM)} designed
	to support dynamic-sized data structures, and we describe a novel
	non-blocking implementation. The non-blocking property we consider
	is obstruction-freedom. Obstruction-freedom is weaker than lock-freedom;
	as a result, it admits substantially simpler and more efficient implementations.
	A novel feature of our obstruction-free {STM} implementation is its
	use of modular contention managers to ensure progress in practice.
	We illustrate the utility of our dynamic {STM} with a straightforward
	implementation of an obstruction-free red-black tree, thereby demonstrating
	a sophisticated non-blocking dynamic data structure that would be
	difficult to implement by other means. We also present the results
	of simple preliminary performance experiments that demonstrate that
	an "early release" feature of our {STM} is useful for reducing contention,
	and that our {STM} lends itself to the effective use of modular contention
	managers.},
  doi = {10.1145/872035.872048},
  isbn = {1-58113-708-7},
  keywords = {ok},
  url = {http://portal.acm.org/citation.cfm?id=872048}
}

@ARTICLE{herlihy_transactional_1993,
  author = {Herlihy, Maurice and Moss, J. Eliot B and Eliot, J. and Moss, B.},
  title = {Transactional Memory: Architectural Support for {Lock-Free} Data
	Structures},
  journal = {{IN} {PROCEEDINGS} {OF} {THE} {20TH} {ANNUAL} {INTERNATIONAL} {SYMPOSIUM}
	{ON} {COMPUTER} {ARCHITECTURE}},
  year = {1993},
  pages = {289---300},
  keywords = {read if time},
  shorttitle = {Transactional Memory},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.5910}
}

@ARTICLE{herlihy_linearizability:_1990,
  author = {Herlihy, Maurice P. and Wing, Jeannette M.},
  title = {Linearizability: a correctness condition for concurrent objects},
  journal = {{ACM} Trans. Program. Lang. Syst.},
  year = {1990},
  volume = {12},
  pages = {463--492},
  number = {3},
  abstract = {A concurrent object is a data object shared by concurrent processes.
	Linearizability is a correctness condition for concurrent objects
	that exploits the semantics of abstract data types. It permits a
	high degree of concurrency, yet it permits programmers to specify
	and reason about concurrent objects using known techniques from the
	sequential domain. Linearizability provides the illusion that each
	operation applied by concurrent processes takes effect instantaneously
	at some point between its invocation and its response, implying that
	the meaning of a concurrent object's operations can be given by pre-
	and post-conditions. This paper defines linearizability, compares
	it to other correctness conditions, presents and demonstrates a method
	for proving the correctness of implementations, and shows how to
	reason about concurrent objects, given they are linearizable.},
  doi = {10.1145/78969.78972},
  keywords = {read if time},
  shorttitle = {Linearizability},
  url = {http://portal.acm.org/citation.cfm?id=78969.78972}
}

@MISC{huang_transforming_2010,
  author = {Huang, Yuanjie and Peng, Liang and Wu, Chengyong and Kashnikov, Yuriy
	and Rennecke, J\"{o}rn and Fursin, Grigori},
  title = {Transforming {GCC} into a research-friendly environment: plugins
	for optimization tuning and reordering, function cloning and program
	instrumentation},
  howpublished = {http://hal.inria.fr/inria-00451106\_v1/},
  month = jan,
  year = {2010},
  note = {Computer scientists are always eager to have a powerful, robust and
	stable compiler infrastructure. However, until recently, researchers
	had to either use available and often unstable research compilers,
	create new ones from scratch, try to hack open-source non-research
	compilers or use source to source tools. It often requires duplication
	of a large amount of functionality available in current production
	compilers while making questionable the practicality of the obtained
	research results. The Interactive Compilation Interface {(ICI)} has
	been introduced to avoid such time-consuming replication and transform
	popular, production compilers such as {GCC} into research toolsets
	by providing an ability to access, modify and extend {GCC's} internal
	functionality through a compiler-dependent hook and clear compiler-independent
	{API} with external portable plugins without interrupting the natural
	evolution of a compiler. In this paper, we describe our recent extensions
	to {GCC} and {ICI} with the preliminary experimental data to support
	selection and reordering of optimization passes with a dependency
	grammar, control of individual transformations and their parameters,
	generic function cloning and program instrumentation. We are synchronizing
	these developments implemented during Google Summer of Code'09 program
	with the mainline {GCC} 4.5 and its native low-level plugin system.
	These extensions are intended to enable and popularize the use of
	{GCC} for realistic research on empirical iterative feedback-directed
	compilation, statistical collective optimization, run-time adaptation
	and development of intelligent self-tuning computing systems among
	other important topics. Such research infrastructure should help
	researchers prototype and validate their ideas quickly in realistic,
	production environments while keeping portability of their research
	plugins across different releases of a compiler. Moreover, it should
	also allow to move successful ideas back to {GCC} much faster thus
	helping to improve, modularize and clean it up. Furthermore, we are
	porting {GCC} with {ICI} extensions for performance/power auto-tuning
	for data centers and cloud computing systems with heterogeneous architectures
	or for continuous whole system optimization.},
  shorttitle = {Transforming {GCC} into a research-friendly environment},
  url = {http://hal.inria.fr/inria-00451106_v1/}
}

@INPROCEEDINGS{hudson_mcrt-malloc:_2006,
  author = {Hudson, Richard L. and Saha, Bratin and {Adl-Tabatabai}, {Ali-Reza}
	and Hertzberg, Benjamin C.},
  title = {{McRT-Malloc:} a scalable transactional memory allocator},
  booktitle = {Proceedings of the 5th international symposium on Memory management},
  year = {2006},
  pages = {74--83},
  address = {Ottawa, Ontario, Canada},
  publisher = {{ACM}},
  abstract = {Emerging multi-core processors promise to provide an exponentially
	increasing number of hardware threads with every generation. Applications
	will need to be highly concurrent to fullyuse the power of these
	processors. To enable maximum concurrency, libraries (such as malloc-free
	packages) would therefore need to use non-blocking algorithms. But
	lock-free algorithms are notoriously difficult to reason about and
	inappropriate for average programmers. Transactional memory promises
	to significantly ease concurrent programming for the average programmer.
	This paper describes a highly efficient non-blocking malloc/free
	algorithm that supports memory allocation and deallocation inside
	transactional code blocks. Thus this paper describes a memory allocator
	that is suitable for emerging multi-core applications, while supporting
	modern concurrency {constructs.This} paper makes several novel contributions.
	It is the first to integrate a software transactional memory system
	with a malloc/free based memory allocator. We present the first algorithm
	which ensures that space allocated in an aborted transaction is properly
	freed and does not lead to a space blowup. Unlike previous lock-free
	malloc packages, our algorithm avoids atomic operations on typical
	code paths, making our algorithm substantially more efficient.},
  doi = {10.1145/1133956.1133967},
  isbn = {1-59593-221-6},
  keywords = {memory management, read if time, runtimes, synchronization, transactional
	memory},
  shorttitle = {{McRT-Malloc}},
  url = {http://portal.acm.org/citation.cfm?id=1133956.1133967}
}

@INPROCEEDINGS{kejariwal_tight_2007,
  author = {Kejariwal, Arun and Tian, Xinmin and Girkar, Milind and Li, Wei and
	Kozhukhov, Sergey and Banerjee, Utpal and Nicolau, Alexander and
	Veidenbaum, Alexander V. and Polychronopoulos, Constantine D.},
  title = {Tight analysis of the performance potential of thread speculation
	using spec {CPU} 2006},
  booktitle = {Proceedings of the 12th {ACM} {SIGPLAN} symposium on Principles and
	practice of parallel programming},
  year = {2007},
  pages = {215--225},
  address = {San Jose, California, {USA}},
  publisher = {{ACM}},
  abstract = {Multi-cores such as the Intel{\textregistered}1 Core{\texttrademark}2
	Duo processor, facilitate efficient thread-level parallel execution
	of ordinary programs, wherein the different threads-of-execution
	are mapped onto different physical processors. In this context, several
	techniques have been proposed for auto-parallelization of programs.
	Recently, thread-level speculation {(TLS)} has been proposed as a
	means to parallelize difficult-to-analyze serial codes. In general,
	more than one technique can be employed for parallelizing a given
	program. The overlapping nature of the applicability of the various
	techniques makes it hard to assess the intrinsic performance potential
	of each. In this paper, we present a tight analysis of the (unique)
	performance potential of both: (a) {TLS} in general and (b) specific
	types of thread-level speculation, viz., control speculation, data
	dependence speculation and data value speculation, for the {SPEC2}
	{CPU2006} benchmark suite in light of the various limiting factors
	such as the threading overhead and misspeculation penalty. To the
	best of our knowledge, this is the first evaluation of {TLS} based
	on {SPEC} {CPU2006} and accounts for the aforementioned real-life
	con-straints. Our analysis shows that, at the innermost loop level,
	the upper bound on the speedup uniquely achievable via {TLS} with
	the state-of-the-art thread implementations for both {SPEC} {CINT2006}
	and {CFP2006} is of the order of 1\%.},
  doi = {10.1145/1229428.1229475},
  isbn = {978-1-59593-602-8},
  keywords = {conflict probability, misspeculation penalty, performance evaluation,
	speculative execution, threading overhead},
  url = {http://portal.acm.org/citation.cfm?id=1229475}
}

@ARTICLE{kejariwal_performance_2006,
  author = {Kejariwal, Arun and Tian, Xinmin and Li, Wei and Girkar, Milind and
	Kozhukhov, Sergey and Saito, Hideki and Banerjee, Utpal and Nicolau,
	Alexandru and Veidenbaum, Alexander V and Polychronopoulos, Constantine
	D and Alex, Ru Nicolau},
  title = {On the Performance Potential of Different Types of Speculative {Thread-Level}
	Parallelism},
  year = {2006},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.61.7541}
}

@ARTICLE{kennedy_compiler_1994,
  author = {Kennedy, Ken},
  title = {Compiler technology for machine-indepenent parallel programming},
  journal = {International Journal of Parallel Programming},
  year = {1994},
  volume = {22},
  pages = {79--98},
  number = {1},
  month = feb,
  abstract = {Abstract Historically, the principal achievement of compiler technology
	has been to make it possible to program in a high-level, machine-independent
	style. The absence of compiler technology to provide such a style
	for parallel computers is the main reason these systems have not
	found widespread acceptance. This paper examines the prospects for
	machine-independent parallel programming, concentrating on Fortran
	D and High Performance Fortran, which support machine-independent
	expression of {\textquotedblleft}data parallelism.{\textquotedblright}},
  doi = {10.1007/BF02577793},
  url = {http://dx.doi.org/10.1007/BF02577793}
}

@BOOK{kennedy_optimizing_2002,
  title = {Optimizing compilers for modern architectures: a dependence-based
	approach},
  publisher = {Morgan Kaufmann Publishers Inc.},
  year = {2002},
  author = {Kennedy, Ken and Allen, John R.},
  abstract = {Modern computer architectures designed with high-performance microprocessors
	offer tremendous potential gains in performance over previous designs.
	Yet their very complexity makes it increasingly difficult to produce
	efficient code and to realize their full potential. This landmark
	text from two leaders in the field focuses on the pivotal role that
	compilers can play in addressing this critical issue. The basis for
	all the methods presented in this book is data dependence, a fundamental
	compiler analysis tool for optimizing programs on high-performance
	microprocessors and parallel architectures. It enables compiler designers
	to write compilers that automatically transform simple, sequential
	programs into forms that can exploit special features of these modern
	architectures. The text provides a broad introduction to data dependence,
	to the many transformation strategies it supports, and to its applications
	to important optimization problems such as parallelization, compiler
	memory hierarchy management, and instruction scheduling. The authors
	demonstrate the importance and wide applicability of dependence-based
	compiler optimizations and give the compiler writer the basics needed
	to understand and implement them. They also offer cookbook explanations
	for transforming applications by hand to computational scientists
	and engineers who are driven to obtain the best possible performance
	of their complex applications.},
  isbn = {1-55860-286-0},
  shorttitle = {Optimizing compilers for modern architectures},
  url = {http://portal.acm.org/citation.cfm?id=502981}
}

@ARTICLE{Kno94,
  author = {J. Knoop and O. R\"uthing and B. Steffen},
  title = {Optimal Code Motion: Theory and Practice},
  journal = {ACM Trans. on Programming Languages and Systems},
  year = {1994},
  volume = {16},
  pages = {1117--1155},
  number = {4}
}

@INPROCEEDINGS{konrad_elimination_2011,
  author = {Trifunovic, Konrad and Cohen, Albert and Ladelski, Razya and 
	Li, Feng},
  title = {Elimination of {Memory-Based} Dependences for {Loop-Nest} Optimization
	and Parallelization},
  booktitle = {3rd {GCC} Research Opportunities Workshop},
  year = {2011},
  address = {Chamonix, France}
}

@INPROCEEDINGS{kulkarni_how_2009,
  author = {Kulkarni, Milind and Burtscher, Martin and Inkulu, Rajeshkar and
	Pingali, Keshav and Cas\c{c}aval, Calin},
  title = {How much parallelism is there in irregular applications?},
  booktitle = {Proceedings of the 14th {ACM} {SIGPLAN} symposium on Principles and
	practice of parallel programming},
  year = {2009},
  pages = {3--14},
  address = {Raleigh, {NC}, {USA}},
  publisher = {{ACM}},
  abstract = {Irregular programs are programs organized around pointer-based data
	structures such as trees and graphs. Recent investigations by the
	Galois project have shown that many irregular programs have a generalized
	form of data-parallelism called amorphous data-parallelism. However,
	in many programs, amorphous data-parallelism cannot be uncovered
	using static techniques, and its exploitation requires runtime strategies
	such as optimistic parallel execution. This raises a natural question:
	how much amorphous data-parallelism actually exists in irregular
	programs?},
  doi = {10.1145/1504176.1504181},
  isbn = {978-1-60558-397-6},
  keywords = {optimistic parallelism, parallelism profiles, profiling},
  url = {http://portal.acm.org/citation.cfm?id=1504176.1504181}
}

@ARTICLE{lamport_how_1979,
  author = {Lamport, L.},
  title = {How to Make a Multiprocessor Computer That Correctly Executes Multiprocess
	Progranm},
  journal = {Computers, {IEEE} Transactions on},
  year = {1979},
  volume = {C-28},
  pages = {690--691},
  number = {9},
  abstract = {Many large sequential computers execute operations in a different
	order than is specified by the program. A correct execution is achieved
	if the results produced are the same as would be produced by executing
	the program steps in order. For a multiprocessor computer, such a
	correct execution by each processor does not guarantee the correct
	execution of the entire program. Additional conditions are given
	which do guarantee that a computer correctly executes multiprocess
	programs.},
  doi = {10.1109/TC.1979.1675439},
  issn = {0018-9340},
  keywords = {Computer design, concurrent computing, hardware correctness, multiprocessing,
	parallel processing}
}

@ARTICLE{lamport_new_1974,
  author = {Lamport, Leslie},
  title = {A new solution of Dijkstra's concurrent programming problem},
  journal = {Commun. {ACM}},
  year = {1974},
  volume = {17},
  pages = {453--455},
  number = {8},
  abstract = {A simple solution to the mutual exclusion problem is presented which
	allows the system to continue to operate despite the failure of any
	individual component.},
  doi = {10.1145/361082.361093},
  keywords = {concurrent programming, critical section, multiprocessing, semaphores},
  url = {http://portal.acm.org/citation.cfm?id=361093}
}

@INPROCEEDINGS{langdon_fast_2009,
  author = {Langdon, W. B.},
  title = {A fast high quality pseudo random number generator for {nVidia} {CUDA}},
  booktitle = {Proceedings of the 11th Annual Conference Companion on Genetic and
	Evolutionary Computation Conference: Late Breaking Papers},
  year = {2009},
  pages = {2511--2514},
  address = {Montreal, Qu\'{e}bec, Canada},
  publisher = {{ACM}},
  abstract = {Previously either due to hardware {GPU} limits or older versions of
	software, careful implementation of {PRNGs} was required to make
	good use of the limited numerical precision available on graphics
	cards. Newer {nVidia} G80 and Tesla hardware support double precision.
	This is available to high level programmers via {CUDA.} This allows
	a much simpler C++ implementation of {Park-Miller} random numbers,
	which provides a four fold speed up compared to an earlier {GPU}
	implementation. Code is available via {http://www.cs.ucl.ac.uk/staff/W.Langdon/ftp/gp-code/random-numbers/cuda\_park-miller.tar.gz}},
  doi = {10.1145/1570256.1570353},
  isbn = {978-1-60558-505-5},
  keywords = {gpgpu, gpu, park-miller, prng, tesla},
  url = {http://portal.acm.org/citation.cfm?id=1570256.1570353&coll=GUIDE&dl=GUIDE&CFID=105156593&CFTOKEN=37140497}
}

@ARTICLE{larus_transactional_2006,
  author = {Larus, James R. and Rajwar, Ravi},
  title = {Transactional Memory},
  journal = {Synthesis Lectures on Computer Architecture},
  year = {2006},
  volume = {1},
  pages = {1--226},
  number = {1},
  month = jan,
  doi = {10.2200/S00070ED1V01Y200611CAC002},
  issn = {1935-3235},
  keywords = {ok},
  url = {http://www.morganclaypool.com/doi/abs/10.2200/S00070ED1V01Y200611CAC002}
}

@ARTICLE{lefebvre_automatic_1998,
  author = {Lefebvre, Vincent and Feautrier, Paul},
  title = {Automatic storage management for parallel programs},
  journal = {Parallel Computing},
  year = {1998},
  volume = {24},
  pages = {649--671},
  doi = {10.1016/S0167-8191(98)00029-5},
  issn = {01678191}
}

@ARTICLE{lev_debugging_2006,
  author = {Lev, Y. and Moir, M.},
  title = {Debugging with transactional memory},
  journal = {{TRANSACT} 2006},
  year = {2006},
  keywords = {read no}
}

@INPROCEEDINGS{li_array_1992,
  author = {Li, Zhiyuan},
  title = {Array privatization for parallel execution of loops},
  booktitle = {Proceedings of the 6th international conference on Supercomputing},
  year = {1992},
  pages = {313--322},
  address = {Washington, D. C., United States},
  publisher = {{ACM}},
  abstract = {In recent experiments, array privatization played a critical role
	in successful parallelization of several real programs. This paper
	presents compiler algorithms for the program analysis for this transformation.
	The paper also addresses issues in the implementation.},
  doi = {10.1145/143369.143426},
  isbn = {0-89791-485-6},
  url = {http://portal.acm.org/citation.cfm?id=143369.143426}
}

@ARTICLE{lim_maximizing_1997,
  author = {Lim, Amy W and Lam, Monica S},
  title = {Maximizing parallelism and minimizing synchronization with affine
	transforms},
  journal = {Parallel Computing},
  year = {1997},
  volume = {24},
  pages = {201---214},
  doi = {10.1.1.23.6838},
  url = {http://citeseerx.ksu.edu.sa/viewdoc/summary?doi=10.1.1.23.6838}
}

@MISC{louis-noel_polybench_2010,
  author = {{Louis-Noel}, Pouchet},
  title = {{PolyBench} suite},
  howpublished = {http://www.cse.ohio-state.edu/{\textasciitilde}pouchet/software/polybench/},
  year = {2010},
  url = {http://www.cse.ohio-state.edu/~pouchet/software/polybench/}
}

@ARTICLE{mankin_software_2009,
  author = {Mankin, Jennifer and Kaeli, David and Ardini, John},
  title = {Software transactional memory for multicore embedded systems},
  journal = {{SIGPLAN} Not.},
  year = {2009},
  volume = {44},
  pages = {90--98},
  number = {7},
  abstract = {Embedded systems, like general-purpose systems, can benefit from parallel
	execution on a symmetric multicore platform. Unfortunately, concurrency
	issues present in general-purpose programming also apply to embedded
	systems, protection from which is currently only offered with performance-limiting
	coarse-grained locking or error-prone and difficult-to-implement
	fine-grained locking. Transactional memory offers relief from these
	mechanisms, but has primarily been investigated on general-purpose
	systems. In this paper, we present Embedded Software Transactional
	Memory {(ESTM)} as a novel solution to the concurrency problem in
	parallel embedded applications. We investigate common software transactional
	memory design decisions and discuss the best decisions for an embedded
	platform. We offer a full implementation of an embedded {STM} and
	test it against both coarse-grained and fine-grained locking mechanisms.
	We find that we can meet or beat the performance of fine-grained
	locking over a range of application characteristics, including size
	of shared data, time spent in the critical section, and contention
	between threads. Our {ESTM} implementation benefits from the effective
	use of L1 memory, a feature which is built into our {STM} model but
	which cannot be directly utilized by traditional locking mechanisms.},
  doi = {10.1145/1543136.1542465},
  keywords = {embedded systems, locking, multicore, ok, resumed, software transactional
	memory (stm), synchronization, transactions},
  url = {http://portal.acm.org/citation.cfm?id=1543136.1542465}
}

@ARTICLE{marathe_lowering_2006,
  author = {Marathe, Virendra J and Spear, Michael F and Heriot, Christopher
	and Acharya, Athul and Eisenstat, David and Iii, William N. Scherer
	and Scott, Michael L},
  title = {Lowering the overhead of nonblocking software transactional memory},
  journal = {{DEPT.} {OF} {COMPUTER} {SCIENCE}, {UNIV.} {OF} {ROCHESTER}},
  year = {2006},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.118.1331}
}

@INPROCEEDINGS{marathe_scalable_2008,
  author = {Marathe, Virendra J. and Spear, Michael F. and Scott, Michael L.},
  title = {Scalable Techniques for Transparent Privatization in Software Transactional
	Memory},
  booktitle = {Proceedings of the 2008 37th International Conference on Parallel
	Processing},
  year = {2008},
  pages = {67--74},
  publisher = {{IEEE} Computer Society},
  abstract = {We address the recently recognized privatization problem in software
	transactional memory {(STM)} runtimes, and introduce the notion of
	partially visible reads {(PVRs)} to heuristically reduce the overhead
	of transparent privatization. Specifically, {PVRs} avoid the need
	for a "privatization fence" in the absence of conflict with concurrent
	readers. We present several techniques to trade off the cost of enforcing
	partial visibility with the precision of conflict detection. We also
	consider certain special-case variants of our approach, e.g., for
	predominantly read-only workloads. We compare our implementations
	to prior techniques on a multicore Niagara1 system using a variety
	of artificial workloads. Our results suggest that while no one technique
	performs best in all cases, a dynamic hybrid of {PVRs} and strict
	in-order commits is stable and reasonably fast across a wide range
	of load parameters. At the same time, the remaining overheads are
	high enough to suggest the need for programming model or architectural
	support.},
  isbn = {978-0-7695-3374-2},
  keywords = {performance, privatization, stm},
  url = {http://portal.acm.org/citation.cfm?id=1442492}
}

@ARTICLE{markoff_military_2008,
  author = {Markoff, John},
  title = {Military Supercomputer Sets Record},
  journal = {The New York Times},
  year = {2008},
  month = jun,
  chapter = {Technology},
  issn = {0362-4331},
  keywords = {Computer Chips, Computers and the Internet, International Business
	Machines Corp, Laboratories and Scientific Equipment, Los Alamos
	National Laboratory, New Mexico, Roadrunner, Science and Technology,
	United States Armament and Defense},
  url = {http://www.nytimes.com/2008/06/09/technology/09petaflops.html?_r=2&oref=slogin}
}

@ARTICLE{milo_martin_subtleties_2006,
  author = {Milo Martin and Colin Blundell and E. Lewis},
  title = {Subtleties of Transactional Memory Atomicity Semantics},
  journal = {Computer Architecture Letters},
  year = {2006},
  volume = {5},
  pages = {17},
  number = {2},
  abstract = {Transactional memory has great potential for simplifying multithreaded
	programming by allowing programmers to specify regions of the program
	that must appear to execute atomically. Transactional memory implementations
	then optimistically execute these transactions concurrently to obtain
	high performance. This work shows that the same atomic guarantees
	that give transactions their power also have unexpected and potentially
	serious negative effects on programs that were written assuming narrower
	scopes of atomicity. We make four contributions: (1) we show that
	a direct translation of lock-based critical sections into transactions
	can introduce deadlock into otherwise correct programs, (2) we introduce
	the terms strong atomicity and weak atomicity to describe the interaction
	of transactional and non-transactional code, (3) we show that code
	that is correct under weak atomicity can deadlock under strong atomicity,
	and (4) we demonstrate that sequentially composing transactional
	code can also introduce deadlocks. These observations invalidate
	the intuition that transactions are strictly safer than lock-based
	critical sections, that strong atomicity is strictly safer than weak
	atomicity, and that transactions are always composable},
  doi = {10.1109/L-CA.2006.18},
  issn = {1556-6056},
  keywords = {and parallel languages, Computer Systems Organization, Concurrent,
	deadlock, direct translation, distributed, Language Classifications,
	lock-based critical sections, Multi-core/single-chip multiprocessors,
	Multiple Data Stream Architectures {(Multiprocessors)}, multithreaded
	programming, multi-threading, nontransactional code, operating systems
	(computers), Parallel Architectures, Processor Architectures, Programming
	Languages, program verification, read if time, sequentially composing
	transactional code, {Software/Software} Engineering, strong atomicity,
	transactional memory atomicity semantics, transaction processing,
	weak atomicity}
}

@INPROCEEDINGS{martinez_speculative_2002,
  author = {Martinez, Jos\'{e} and Torrellas, Josep},
  title = {Speculative Synchronization: Applying {Thread-Level} Speculation
	to Explicitly Parallel Applications},
  booktitle = {International Conference on Architectural Support for Programming
	Languages and Operating Systems {(ASPLOS)}},
  year = {2002},
  pages = {29, 18},
  abstract = {Barriers, locks, and flags are synchronizing operations widely used
	by programmers and parallelizing compilers to produce race-free parallel
	programs. Often times, these operations are placed suboptimally,
	either because of conservative assumptions about the program, or
	merely for code simplicity.},
  keywords = {cs533},
  shorttitle = {Speculative Synchronization},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.9209}
}

@INPROCEEDINGS{mattson_introduction_2006,
  author = {Mattson, Tim},
  title = {Introduction to {OpenMP}},
  booktitle = {Proceedings of the 2006 {ACM/IEEE} conference on Supercomputing},
  year = {2006},
  pages = {209},
  address = {Tampa, Florida},
  publisher = {{ACM}},
  abstract = {The {OpenMP} Application Programming Interface {(API)} defines compiler
	directives and library routines that make it relatively easy to create
	parallel programs. It first appeared in 1997 and has become the de
	facto standard for programming shared memory computers. Recent advances
	in {OpenMP} technology are expanding its reach to distributed memory
	systems (e.g. clusters) as {well.In} this tutorial, we will provide
	a comprehensive introduction to {OpenMP.} By dedicating a half-day
	tutorial to the {API} itself, we will be able to cover every construct
	within the language and show how {OpenMP} is used to program shared
	memory multiprocessor computers, multi-core {CPUs} and clusters.},
  doi = {10.1145/1188455.1188673},
  isbn = {0-7695-2700-0},
  url = {http://portal.acm.org/ft_gateway.cfm?id=1188673&type=html&coll=GUIDE&dl=GUIDE&CFID=110722940&CFTOKEN=21985202}
}

@INPROCEEDINGS{maydan_array-data_1993,
  author = {Maydan, Dror E. and Amarasinghe, Saman P. and Lam, Monica S.},
  title = {Array-data flow analysis and its use in array privatization},
  booktitle = {Proceedings of the 20th {ACM} {SIGPLAN-SIGACT} symposium on Principles
	of programming languages - {POPL} '93},
  year = {1993},
  pages = {2--15},
  address = {Charleston, South Carolina, United States},
  doi = {10.1145/158511.158515},
  url = {http://dl.acm.org/citation.cfm?id=158515}
}

@INPROCEEDINGS{mehrara_parallelizing_2009,
  author = {Mehrara, Mojtaba and Hao, Jeff and Hsu, {Po-Chun} and Mahlke, Scott},
  title = {Parallelizing sequential applications on commodity hardware using
	a low-cost software transactional memory},
  booktitle = {Proceedings of the 2009 {ACM} {SIGPLAN} conference on Programming
	language design and implementation},
  year = {2009},
  pages = {166--176},
  address = {Dublin, Ireland},
  publisher = {{ACM}},
  abstract = {Multicore designs have emerged as the mainstream design paradigm for
	the microprocessor industry. Unfortunately, providing multiple cores
	does not directly translate into performance for most applications.
	The industry has already fallen short of the decades-old performance
	trend of doubling performance every 18 months. An attractive approach
	for exploiting multiple cores is to rely on tools, both compilers
	and runtime optimizers, to automatically extract threads from sequential
	applications. However, despite decades of research on automatic parallelization,
	most techniques are only effective in the scientific and data parallel
	domains where array dominated codes can be precisely analyzed by
	the compiler. Thread-level speculation offers the opportunity to
	expand parallelization to general-purpose programs, but at the cost
	of expensive hardware support. In this paper, we focus on providing
	low-overhead software support for exploiting speculative parallelism.
	We propose {STMlite}, a light-weight software transactional memory
	model that is customized to facilitate profile-guided automatic loop
	parallelization. {STMlite} eliminates a considerable amount of checking
	and locking overhead in conventional software transactional memory
	models by decoupling the commit phase from main transaction execution.
	Further, strong atomicity requirements for generic transactional
	memories are unnecessary within a stylized automatic parallelization
	framework. {STMlite} enables sequential applications to extract meaningful
	performance gains on commodity multicore hardware.},
  doi = {10.1145/1542476.1542495},
  isbn = {978-1-60558-392-1},
  keywords = {automatic parallelization, loop level parallelism, profile-guided
	optimization, software transactional memory, thread-level speculation},
  url = {http://portal.acm.org/citation.cfm?id=1542495}
}

@INPROCEEDINGS{menon_practical_2008,
  author = {Menon, Vijay and Balensiefer, Steven and Shpeisman, Tatiana and {Adl-Tabatabai},
	{Ali-Reza} and Hudson, Richard L. and Saha, Bratin and Welc, Adam},
  title = {Practical weak-atomicity semantics for java stm},
  booktitle = {Proceedings of the twentieth annual symposium on Parallelism in algorithms
	and architectures},
  year = {2008},
  pages = {314--325},
  address = {Munich, Germany},
  publisher = {{ACM}},
  abstract = {As memory transactions have been proposed as a language-level replacement
	for locks, there is growing need for well-defined semantics. In contrast
	to database transactions, transaction memory {(TM)} semantics are
	complicated by the fact that programs may access the same memory
	locations both inside and outside transactions. Strongly atomic semantics,
	where non transactional accesses are treated as implicit single-operation
	transactions, remain difficult to provide without specialized hardware
	support or significant performance overhead. As an alternative, many
	in the community have informally proposed that a single global lock
	semantics [18,10], where transaction semantics are mapped to those
	of regions protected by a single global lock, provide an intuitive
	and efficiently implementable model for programmers.},
  doi = {10.1145/1378533.1378588},
  isbn = {978-1-59593-973-9},
  keywords = {java, memory models, P, programming language semantics, read ?, transactional
	memory, weak atomicity},
  url = {http://portal.acm.org/citation.cfm?doid=1378533.1378588}
}

@INPROCEEDINGS{milovanovi_multithreaded_2007,
  author = {Milovanovi\'{c}, M. and Ferrer, R. and Gajinov, V. and Unsal, O.
	S and Cristal, A. and Ayguad\'{e}, E. and Valero, M.},
  title = {Multithreaded software transactional memory and {OpenMP}},
  booktitle = {Proceedings of the 2007 workshop on {MEmory} performance: {DEaling}
	with Applications, systems and architecture},
  year = {2007},
  pages = {81{\textendash}88},
  keywords = {read yes}
}

@INCOLLECTION{moore_cramming_2000,
  author = {Moore, Gordon E.},
  title = {Cramming more components onto integrated circuits},
  booktitle = {Readings in computer architecture},
  publisher = {Morgan Kaufmann Publishers Inc.},
  year = {2000},
  pages = {56--59},
  isbn = {1-55860-539-8},
  url = {http://portal.acm.org/citation.cfm?id=333074}
}

@ARTICLE{moore_logtm:_2006,
  author = {Moore, Kevin E and Bobba, Jayaram and Moravan, Michelle J and Hill,
	Mark D and Wood, David A},
  title = {Logtm: Log-based transactional memory},
  journal = {{IN} {HPCA}},
  year = {2006},
  pages = {254---265},
  keywords = {ok, resumed},
  shorttitle = {Logtm},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.114.5680}
}

@BOOK{muchnick_advanced_1997,
  title = {Advanced compiler design and implementation},
  publisher = {Morgan Kaufmann},
  year = {1997},
  author = {Muchnick, S. S}
}

@BOOK{nguyen_gpu_2007,
  title = {{GPU} Gems 3},
  publisher = {{Addison-Wesley} Professional},
  year = {2007},
  author = {Nguyen, Hubert},
  abstract = {The {GPU} Gems series features a collection of the most essential
	algorithms required by {Next-Generation} {3D} {Engines.Martin} Mittring,
	Lead Graphics Programmer, {CrytekThis} third volume of the best-selling
	{GPU} Gems series provides a snapshot of todays latest Graphics Processing
	Unit {(GPU)} programming techniques. The programmability of modern
	{GPUs} allows developers to not only distinguish themselves from
	one another but also to use this awesome processing power for non-graphics
	applications, such as physics simulation, financial analysis, and
	even virus detectionparticularly with the {CUDA} architecture. Graphics
	remains the leading application for {GPUs}, and readers will find
	that the latest algorithms create ultra-realistic characters, better
	lighting, and post-rendering compositing effects. Major topics {includeGeometryLight}
	and {ShadowsRenderingImage} {EffectsPhysics} {SimulationGPU} {ComputingContributors}
	are from the following corporations and {universities:3DfactoAdobe}
	{SystemsAppleBudapest} University of Technology and {EconomicsCGGVeritasThe}
	Chinese University of Hong {KongCornell} {UniversityCrytekCzech}
	Technical University in {PragueDartmouth} {CollegeDigital} Illusions
	Creative {EntertainmentEindhoven} University of {TechnologyElectronic}
	{ArtsHavokHelsinki} University of {TechnologyImperial} College {LondonInfinity}
	{WardJuniper} {NetworksLaBRIINRIA}, University of Bordeauxmental
	{imagesMicrosoft} {ResearchMove} {InteractiveNCsoft} {CorporationNVIDIA}
	{CorporationPerpetual} {EntertainmentPlaylogic} Game {FactoryPolytimeRainbow}
	{StudiosSEGA} {CorporationUFRGS} {(Brazil)Ulm} {UniversityUniversity}
	of California, {DavisUniversity} of Central {FloridaUniversity} of
	{CopenhagenUniversity} of {GironaUniversity} of Illinois at {Urbana-ChampaignUniversity}
	of North Carolina Chapel {HillUniversity} of {TokyoUniversity} of
	{WaterlooSection} Editors include {NVIDIA} engineers: Cyril Zeller,
	Evan Hart, Ignacio Castao, Kevin Bjorke, Kevin Myers, and Nolan {Goodnight.The}
	accompanying {DVD} includes complementary examples and sample programs.},
  isbn = {9780321515261},
  url = {http://portal.acm.org/citation.cfm?id=1536934&coll=GUIDE&dl=GUIDE&CFID=105156593&CFTOKEN=37140497}
}

@BOOK{nguyen_gpu_2007-1,
  title = {Gpu gems 3},
  publisher = {{Addison-Wesley} Professional},
  year = {2007},
  author = {Nguyen, Hubert},
  abstract = {{{\textquotedblleft}The} {GPU} Gems series features a collection of
	the most essential algorithms required by {Next-Generation} {3D}
	Engines.{\textquotedblright} {-Martin} Mittring, Lead Graphics Programmer,
	Crytek This third volume of the best-selling {GPU} Gems series provides
	a snapshot of today's latest Graphics Processing Unit {(GPU)} programming
	techniques. The programmability of modern {GPUs} allows developers
	to not only distinguish themselves from one another but also to use
	this awesome processing power for non-graphics applications, such
	as physics simulation, financial analysis, and even virus detection-particularly
	with the {CUDA} architecture. Graphics remains the leading application
	for {GPUs}, and readers will find that the latest algorithms create
	ultra-realistic characters, better lighting, and post-rendering compositing
	{effects.Major} topics include Geometry Light and Shadows Rendering
	Image Effects Physics Simulation {GPU} Computing Contributors are
	from the following corporations and {universities:3Dfacto} Adobe
	Systems Apple Budapest University of Technology and Economics {CGGVeritas}
	The Chinese University of Hong Kong Cornell University Crytek Czech
	Technical University in Prague Dartmouth College Digital Illusions
	Creative Entertainment Eindhoven University of Technology Electronic
	Arts Havok Helsinki University of Technology Imperial College London
	Infinity Ward Juniper Networks {LaBRI\"{i}{\textthreequarters}?INRIA},
	University of Bordeaux mental images Microsoft Research Move Interactive
	{NCsoft} Corporation {NVIDIA} Corporation Perpetual Entertainment
	Playlogic Game Factory Polytime Rainbow Studios {SEGA} Corporation
	{UFRGS} {(Brazil)} Ulm University University of California, Davis
	University of Central Florida University of Copenhagen University
	of Girona University of Illinois at {Urbana-Champaign} University
	of North Carolina Chapel Hill University of Tokyo University of {WaterlooSection}
	Editors include {NVIDIA} engineers: Cyril Zeller, Evan Hart, Ignacio
	Casta\~{n}o, Kevin Bjorke, Kevin Myers, and Nolan {Goodnight.The}
	accompanying {DVD} includes complementary examples and sample programs.},
  isbn = {9780321545428},
  url = {http://portal.acm.org/citation.cfm?id=1407436&coll=GUIDE&dl=GUIDE&CFID=105156593&CFTOKEN=37140497}
}

@ARTICLE{nickolls_scalable_2008,
  author = {Nickolls, John and Buck, Ian and Garland, Michael and Skadron, Kevin},
  title = {Scalable Parallel Programming with {CUDA}},
  journal = {Queue},
  year = {2008},
  volume = {6},
  pages = {40--53},
  number = {2},
  abstract = {The advent of multicore {CPUs} and manycore {GPUs} means that mainstream
	processor chips are now parallel systems. Furthermore, their parallelism
	continues to scale with Moore's law. The challenge is to develop
	mainstream application software that transparently scales its parallelism
	to leverage the increasing number of processor cores, much as {3D}
	graphics applications transparently scale their parallelism to manycore
	{GPUs} with widely varying numbers of cores.},
  doi = {10.1145/1365490.1365500},
  url = {http://portal.acm.org/ft_gateway.cfm?id=1365500&type=html&coll=GUIDE&dl=GUIDE&CFID=105156593&CFTOKEN=37140497}
}

@INPROCEEDINGS{novillo_gcc_2007,
  author = {Novillo, D.},
  title = {{GCC} internals},
  booktitle = {International Symposium on Code Generation and Optimization {(CGO)},
	San Jose, California},
  year = {2007}
}

@INPROCEEDINGS{novillo_openmp_2006,
  author = {Novillo, D.},
  title = {{OpenMP} and automatic parallelization in {GCC}},
  booktitle = {{GCC} developers summit},
  year = {2006}
}

@INPROCEEDINGS{oancea_lightweight_2009,
  author = {Oancea, Cosmin E. and Mycroft, Alan and Harris, Tim},
  title = {A lightweight in-place implementation for software thread-level speculation},
  booktitle = {Proceedings of the twenty-first annual symposium on Parallelism in
	algorithms and architectures},
  year = {2009},
  pages = {223--232},
  address = {Calgary, {AB}, Canada},
  publisher = {{ACM}},
  abstract = {Thread-level speculation {(TLS)} is a technique that allows parts
	of a sequential program to be executed in parallel. {TLS} ensures
	the parallel program's behaviour remains true to the language's original
	sequential semantics; for example, allowing multiple iterations of
	a loop to run in parallel if there are no conflicts between them.},
  doi = {10.1145/1583991.1584050},
  isbn = {978-1-60558-606-9},
  keywords = {read yes, roll-back, thread-level speculation (tls)},
  url = {http://portal.acm.org/citation.cfm?id=1583991.1584050}
}

@ARTICLE{oplinger_software_1997,
  author = {Oplinger, Jeffrey and Heine, David and Liao, Shih-wei and Nayfeh,
	Basem A and Lam, Monica S and Olukotun, Kunle},
  title = {Software and Hardware for Exploiting Speculative Parallelism with
	a Multiprocessor},
  year = {1997},
  pages = {97---715},
  keywords = {read no},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.22.9644}
}

@INPROCEEDINGS{ottoni_automatic_2005,
  author = {Ottoni, Guilherme and Rangan, Ram and Stoler, Adam and August, David
	I.},
  title = {Automatic Thread Extraction with Decoupled Software Pipelining},
  booktitle = {Proceedings of the 38th annual {IEEE/ACM} International Symposium
	on Microarchitecture},
  year = {2005},
  pages = {105--118},
  address = {Barcelona, Spain},
  publisher = {{IEEE} Computer Society},
  abstract = {Until recently, a steadily rising clock rate and other uniprocessor
	microarchitectural improvements could be relied upon to consistently
	deliver increasing performance for a wide range of applications.
	Current difficulties in maintaining this trend have lead microprocessor
	manufacturers to add value by incorporating multiple processors on
	a chip. Unfortunately, since decades of compiler research have not
	succeeded in delivering automatic threading for prevalent code properties,
	this approach demonstrates no improvement for a large class of existing
	codes. To find useful work for chip multiprocessors, we propose an
	automatic approach to thread extraction, called Decoupled Software
	Pipelining {(DSWP).} {DSWP} exploits the finegrained pipeline parallelism
	lurking in most applications to extract long-running, concurrently
	executing threads. Use of the non-speculative and truly decoupled
	threads produced by {DSWP} can increase execution efficiency and
	provide significant latency tolerance, mitigating design complexity
	by reducing inter-core communication and per-core resource requirements.
	Using our initial fully automatic compiler implementation and a validated
	processor model, we prove the concept by demonstrating significant
	gains for dual-core chip multiprocessor models running a variety
	of codes. We then explore simple opportunities missed by our initial
	compiler implementation which suggest a promising future for this
	approach.},
  isbn = {0-7695-2440-0},
  url = {http://portal.acm.org/citation.cfm?id=1099547.1100543}
}

@INPROCEEDINGS{packirisamy_exploring_2009,
  author = {Packirisamy, Venkatesan and Zhai, Antonia and {Wei-Chung} Hsu and
	Yew, {Pen-Chung} and Ngai, {Tin-Fook}},
  title = {Exploring speculative parallelism in {SPEC2006}},
  booktitle = {2009 {IEEE} International Symposium on Performance Analysis of Systems
	and Software},
  year = {2009},
  pages = {77--88},
  address = {Boston, {MA}, {USA}},
  month = apr,
  doi = {10.1109/ISPASS.2009.4919640},
  url = {http://www.bibsonomy.org/bibtex/2f3e1144f9c60ceb6e6491ea82a5b8fec/dblp}
}

@ARTICLE{padua_polaris:_1993,
  author = {Padua, David A and Eigenmann, Rudolf and Hoeflinger, Jay and Petersen,
	Paul and Tu, Peng and Weatherford, Stephen and Faigin, Keith},
  title = {Polaris: A {New-Generation} Parallelizing Compiler for {MPPs}},
  journal = {{IN} {CSRD} {REPT.} {NO.} 1306. {UNIV.} {OF} {ILLINOIS} {AT} {URBANA-CHAMPAIGN}},
  year = {1993},
  shorttitle = {Polaris},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.6716}
}

@ARTICLE{papadimitriou_serializability_1979,
  author = {Papadimitriou, Christos H.},
  title = {The serializability of concurrent database updates},
  journal = {J. {ACM}},
  year = {1979},
  volume = {26},
  pages = {631--653},
  number = {4},
  abstract = {Note: {OCR} errors may be found in this Reference List extracted from
	the full text article. {ACM} has opted to expose the complete List
	rather than only correct and linked references.},
  doi = {10.1145/322154.322158},
  url = {http://portal.acm.org/citation.cfm?id=322158&dl=GUIDE,}
}

@ARTICLE{pickett_memory_????,
  author = {Pickett, C. {J.F} and Verbrugge, C. and Kielstra, A.},
  title = {Memory Abstractions for Speculative Multithreading},
  keywords = {read no}
}

@ARTICLE{pop_graphite:_2006,
  author = {Pop, Sebastian and Cohen, Albert and Bastoul, C\'{e}dric and Girbal,
	Sylvain and Silber, Georges-andr\'{e} and Vasilache, Nicolas},
  title = {{GRAPHITE:} Polyhedral Analyses and Optimizations for {GCC}},
  journal = {In proceedings of the 2006 {GCC} developers summit},
  year = {2006},
  pages = {2006},
  shorttitle = {{GRAPHITE}},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.77.9149}
}

@ARTICLE{porter_mapping_????,
  author = {Porter, L. and Choi, B. and Tullsen, D. M},
  title = {Mapping Out a Path from Hardware Transactional Memory to Speculative
	Multithreading},
  keywords = {ok}
}

@BOOK{pouchet_iterative_2010,
  title = {Iterative Optimization in the Polyhedral Model},
  publisher = {Universit\'{e} de {Paris-Sud} 11},
  year = {2010},
  author = {Pouchet, Louis Noel},
  address = {Orsay, France}
}

@INPROCEEDINGS{prabhu_using_2003,
  author = {Prabhu, Manohar K. and Olukotun, Kunle},
  title = {Using thread-level speculation to simplify manual parallelization},
  booktitle = {Proceedings of the ninth {ACM} {SIGPLAN} symposium on Principles
	and practice of parallel programming},
  year = {2003},
  pages = {1--12},
  address = {San Diego, California, {USA}},
  publisher = {{ACM}},
  abstract = {In this paper, we provide examples of how thread-level speculation
	{(TLS)} simplifies manual parallelization and enhances its performance.
	A number of techniques for manual parallelization using {TLS} are
	presented and results are provided that indicate the performance
	contribution of each technique on seven {SPEC} {CPU2000} benchmark
	applications. We also provide indications of the programming effort
	required to parallelize each benchmark. {TLS} parallelization yielded
	a 110\% speedup on our four floating point applications and a 70\%
	speedup on our three integer applications, while requiring only approximately
	80 programmer hours and 150 lines of non-template code per application.
	These results support the idea that manual parallelization using
	{TLS} is an efficient way to extract fine-grain thread-level parallelism.},
  doi = {10.1145/781498.781500},
  isbn = {1-58113-588-2},
  keywords = {chip multiprocessor, data speculation, feedback-driven optimization,
	manual parallel programming, multithreading},
  url = {http://portal.acm.org/citation.cfm?id=781498.781500}
}

@InProceedings{Cal90,
  author = 	 {D. Callahan and S. Carr and K. Kennedy},
  title = 	 {Improving Register Allocation for Subscripted Variables},
  booktitle = 	 pldi # { (PLDI'90)},
  year =	 1990,
  address =	 {White Plains, New York},
  month =	 jun
}

@INPROCEEDINGS{raman_parallel-stage_2008,
  author = {Raman, Easwaran and Ottoni, Guilherme and Raman, Arun and Bridges,
	Matthew J. and August, David I.},
  title = {Parallel-stage decoupled software pipelining},
  booktitle = {Proceedings of the sixth annual {IEEE/ACM} international symposium
	on Code generation and optimization},
  year = {2008},
  pages = {114--123},
  address = {Boston, {MA}, {USA}},
  publisher = {{ACM}},
  abstract = {In recent years, the microprocessor industry has embraced chip multiprocessors
	{(CMPs)}, also known as multi-core architectures, as the dominant
	design paradigm. For existing and new applications to make effective
	use of {CMPs}, it is desirable that compilers automatically extract
	thread-level parallelism from single-threaded applications. {DOALL}
	is a popular automatic technique for loop-level parallelization employed
	successfully in the domains of scientific and numeric computing.
	While {DOALL} generally scales well with the number of iterations
	of the loop, its applicability is limited by the presence of loop-carried
	dependences. A parallelization technique with greater applicability
	is decoupled software pipelining {(DSWP)}, which parallelizes loops
	even in the presence of loop-carried dependences. However, the scalability
	of {DSWP} is limited by the size of the loop body and the number
	of recurrences it contains, which are usually smaller than the loop
	iteration count.},
  doi = {10.1145/1356058.1356074},
  isbn = {978-1-59593-978-4},
  keywords = {automatic parallelization, doall, dswp, multi-core architectures,
	pipelined parallelism, tlp},
  url = {http://portal.acm.org/citation.cfm?id=1356058.1356074}
}

@ARTICLE{rauchwerger_run-time_1998,
  author = {Rauchwerger, Lawrence},
  title = {Run-time parallelization: its time has come},
  journal = {Parallel Comput.},
  year = {1998},
  volume = {24},
  pages = {527--556},
  number = {3-4},
  keywords = {compiler, debugging, inspector/executor, irregular applications, parallelization,
	pointer aliasing, run-time, scheduling, speculative, subscripted
	scripts},
  shorttitle = {Run-time parallelization},
  url = {http://portal.acm.org/citation.cfm?id=287948}
}

@ARTICLE{rauchwerger_lrpd_1999,
  author = {Rauchwerger, Lawrence and Padua, David A.},
  title = {The {LRPD} Test: Speculative {Run-Time} Parallelization of Loops
	with Privatization and Reduction Parallelization},
  journal = {{IEEE} Trans. Parallel Distrib. Syst.},
  year = {1999},
  volume = {10},
  pages = {160--180},
  number = {2},
  abstract = {Current parallelizing compilers cannot identify a significant fraction
	of parallelizable loops because they have complex or statically insufficiently
	defined access patterns. As parallelizable loops arise frequently
	in practice, we advocate a novel framework for their identification:
	speculatively execute the loop as a doall and apply a fully parallel
	data dependence test to determine if it had any cross-iteration dependences;
	if the test fails, then the loop is reexecuted serially. Since, from
	our experience, a significant amount of the available parallelism
	in Fortran programs can be exploited by loops transformed through
	privatization and reduction parallelization, our methods can speculatively
	apply these transformations and then check their validity at run-time.
	Another important contribution of this paper is a novel method for
	reduction recognition which goes beyond syntactic pattern matching:
	It detects at run-time if the values stored in an array participate
	in a reduction operation, even if they are transferred through private
	variables and/or are affected by statically unpredictable control
	flow. We present experimental results on loops from the {PERFECT}
	Benchmarks, which substantiate our claim that these techniques can
	yield significant speedups which are often superior to those obtainable
	by inspector/executor methods},
  keywords = {compilers, doall, ok, p++, parallel processing, privatization., reduction,
	run-time, speculative},
  shorttitle = {The {LRPD} Test},
  url = {http://portal.acm.org/citation.cfm?id=298812}
}

@INPROCEEDINGS{rus_sensitivity_2007,
  author = {Rus, Silvius and Pennings, Maikel and Rauchwerger, Lawrence},
  title = {Sensitivity analysis for automatic parallelization on multi-cores},
  booktitle = {Proceedings of the 21st annual international conference on Supercomputing},
  year = {2007},
  pages = {263--273},
  address = {Seattle, Washington},
  publisher = {{ACM}},
  abstract = {Sensitivity Analysis {(SA)} is a novel compiler technique that complements,
	and integrates with, static automatic parallelization analysis for
	the cases when relevant program behavior is input sensitive. In this
	paper we show how {SA} can extract all the input dependent, statically
	unavailable, conditions for which loops can be dynamically parallelized.
	{SA} generates a sequence of sufficient conditions which, when evaluated
	dynamically in order of their complexity, can each validate the dynamic
	parallel execution of the corresponding loop. For example, {SA} can
	first attempt to validate parallelization by checking simple conditions
	related to loop bounds. If such simple conditions cannot be met,
	then validating dynamic parallelization may require evaluating conditions
	related to the entire memory reference trace of a loop, thus decreasing
	the benefits of parallel execution.},
  doi = {10.1145/1274971.1275008},
  isbn = {978-1-59593-768-1},
  keywords = {multicore, parallelism, sensitivity analysis},
  url = {http://portal.acm.org/citation.cfm?id=1274971.1275008}
}

@ARTICLE{rus_hybrid_2003,
  author = {Rus, Silvius and Rauchwerger, Lawrence and Hoeflinger, Jay},
  title = {Hybrid Analysis: Static \& Dynamic Memory Reference Analysis},
  journal = {International Journal of Parallel Programming},
  year = {2003},
  volume = {31},
  pages = {251--283},
  number = {4},
  abstract = {We present a novel Hybrid Analysis technology which can efficiently
	and seamlessly integrate all static and run-time analysis of memory
	references into a single framework that is capable of performing
	all data dependence analysis and can generate necessary information
	for most associated memory related optimizations. We use {HA} to
	perform automatic parallelization by extracting run-time assertions
	from any loop and generating appropriate run-time tests that range
	from a low cost scalar comparison to a full, reference by reference
	run-time analysis. Moreover we can order the run-time tests in increasing
	order of complexity (overhead) and thus risk the minimum necessary
	overhead. We accomplish this by both extending compile time {IP}
	analysis techniques and by incorporating speculative run-time techniques
	when necessary. Our solution is to bridge {\textquotedblleft}free{\textquotedblright}
	compile time techniques with exhaustive run-time techniques through
	a continuum of simple to complex solutions. We have implemented our
	framework in the Polaris compiler by introducing an innovative intermediate
	representation called {RT\_LMAD} and a run-time library that can
	operate on it. Based on the experimental results obtained to date
	we hope to automatically parallelize most and possibly all {PERFECT}
	codes, a significant accomplishment.},
  doi = {10.1023/A:1024597010150},
  shorttitle = {Hybrid Analysis},
  url = {http://dx.doi.org/10.1023/A:1024597010150}
}

@INPROCEEDINGS{saha_mcrt-stm:_2006,
  author = {Saha, Bratin and {Adl-Tabatabai}, {Ali-Reza} and Hudson, Richard
	L. and Minh, Chi Cao and Hertzberg, Benjamin},
  title = {{McRT-STM:} a high performance software transactional memory system
	for a multi-core runtime},
  booktitle = {Proceedings of the eleventh {ACM} {SIGPLAN} symposium on Principles
	and practice of parallel programming},
  year = {2006},
  pages = {187--197},
  address = {New York, New York, {USA}},
  publisher = {{ACM}},
  abstract = {Applications need to become more concurrent to take advantage of the
	increased computational power provided by chip level multiprocessing.
	Programmers have traditionally managed this concurrency using locks
	(mutex based synchronization). Unfortunately, lock based synchronization
	often leads to deadlocks, makes fine-grained synchronization difficult,
	hinders composition of atomic primitives, and provides no support
	for error recovery. Transactions avoid many of these problems, and
	therefore, promise to ease concurrent {programming.We} describe a
	software transactional memory {(STM)} system that is part of {McRT},
	an experimental {Multi-Core} {RunTime.} The {McRT-STM} implementation
	uses a number of novel algorithms, and supports advanced features
	such as nested transactions with partial aborts, conditional signaling
	within a transaction, and object based conflict detection for {C/C++}
	applications. The {McRT-STM} exports interfaces that can be used
	from {C/C++} programs directly or as a target for compilers translating
	higher level linguistic {constructs.We} present a detailed performance
	analysis of various {STM} design tradeoffs such as pessimistic versus
	optimistic concurrency, undo logging versus write buffering, and
	cache line based versus object based conflict detection. We also
	show a {MCAS} implementation that works on arbitrary values, coexists
	with the {STM}, and can be used as a more efficient form of transactional
	memory. To provide a baseline we compare the performance of the {STM}
	with that of fine-grained and coarse-grained locking using a number
	of concurrent data structures on a 16-processor {SMP} system. We
	also show our {STM} performance on a non-synthetic workload -- the
	Linux sendmail application.},
  doi = {10.1145/1122971.1123001},
  isbn = {1-59593-189-9},
  keywords = {atomic constructs, read yes, runtime environment, software transactional
	memory, two-phase locking and read-versioning},
  shorttitle = {{McRT-STM}},
  url = {http://portal.acm.org/citation.cfm?id=1122971.1123001}
}

@INPROCEEDINGS{saito_large_2002,
  author = {Saito, Hideki and Gaertner, Greg and Jones, Wesley B. and Eigenmann,
	Rudolf and Iwashita, Hidetoshi and Lieberman, Ron and Waveren, G.
	Matthijs van and Whitney, Brian},
  title = {Large System Performance of {SPEC} {OMP2001} Benchmarks},
  booktitle = {Proceedings of the 4th International Symposium on High Performance
	Computing},
  year = {2002},
  pages = {370--379},
  publisher = {{Springer-Verlag}},
  isbn = {{3-540-43674-X}},
  url = {http://portal.acm.org/citation.cfm?id=690717}
}

@INPROCEEDINGS{schindewolf_towards_2009,
  author = {Schindewolf, M. and Cohen, A. and Karl, W. and Marongiu, A. and Benini,
	L.},
  title = {Towards Transactional Memory Support for {GCC}},
  booktitle = {First International Workshop on {GCC} Research Opportunities},
  year = {2009},
  keywords = {ok, resumed}
}

@ARTICLE{schouten_inside_2003,
  author = {Schouten, Dale and Tian, Xinmin and Bik, Aart and Girkar, Milind},
  title = {Inside the Intel compiler},
  journal = {Linux J.},
  year = {2003},
  volume = {2003},
  pages = {4},
  number = {106},
  abstract = {The optimizations and features of Intel's complier for the {IA-32}
	architecture.},
  keywords = {read yes},
  url = {http://portal.acm.org/ft_gateway.cfm?id=606621&type=html&coll=GUIDE&dl=GUIDE&CFID=47872533&CFTOKEN=66506476}
}

@ARTICLE{shavit_software_1997,
  author = {Shavit, Nir and Touitou, Dan},
  title = {Software transactional memory},
  journal = {Distributed Computing},
  year = {1997},
  volume = {10},
  pages = {99--116},
  number = {2},
  month = feb,
  abstract = {Summary. As we learn from the literature, flexibility in choosing
	synchronization operations greatly simplifies the task of designing
	highly concurrent programs. Unfortunately, existing hardware is inflexible
	and is at best on the level of a {Load{\textendash}Linked/Store{\textendash}Conditional}
	operation on a single word. Building on the hardware based transactional
	synchronization methodology of Herlihy and Moss, we offer software
	transactional memory {(STM)}, a novel software method for supporting
	flexible transactional programming of synchronization operations.
	{STM} is non-blocking, and can be implemented on existing machines
	using only a {Load{\textendash}Linked/Store{\textendash}Conditional}
	operation. We use {STM} to provide a general highly concurrent method
	for translating sequential object implementations to non-blocking
	ones based on implementing a k-word compare\&swap {STM-transaction.}
	Empirical evidence collected on simulated multiprocessor architectures
	shows that our method always outperforms the non-blocking translation
	methods in the style of Barnes, and outperforms Herlihy{\textquoteright}s
	translation method for sufficiently large numbers of processors.
	The key to the efficiency of our software-transactional approach
	is that unlike Barnes style methods, it is not based on a costly
	{\textquotedblleft}recursive helping{\textquotedblright} policy.},
  doi = {10.1007/s004460050028},
  keywords = {ok},
  url = {http://dx.doi.org/10.1007/s004460050028}
}

@ARTICLE{shavit_diffracting_1996,
  author = {Shavit, Nir and Zemach, Asaph},
  title = {Diffracting trees},
  journal = {{ACM} Trans. Comput. Syst.},
  year = {1996},
  volume = {14},
  pages = {385--428},
  number = {4},
  abstract = {Shared counters are among the most basic coordination structures in
	multiprocessor conputation, with applications ranging from barrier
	synchronization to concurrent-data-structure design. This article
	introduces diffracting trees, novel data structures for share counting
	and load balancing in a distributed/parallel environment. Empirical
	evidence, collected on a simulated distributed shared-memory machine
	and several simulated message-passing architectures, shows that diffracting
	trees scale better and are more robust than both combining trees
	and counting networks, currently the most effective known methods
	for implementing concurrent counters in software. The use of a randomized
	coordination method together with a combinatorial data structure
	overcomes the resiliency drawbacks of combining trees. Our simulations
	show that to handle the same load, diffracting trees and counting
	networks should have a similar width w, yet the depth of a diffracting
	tree is O(log w), whereas counting networks have depth O(log2 w).
	Diffracting trees have already been used to implement highly efficient
	producer/consumer queues, and we believe diffraction will prove to
	be an effective alternative paradigm to combining and queue-locking
	in the design of many concurrent data structures.},
  doi = {10.1145/235543.235546},
  keywords = {contention, counting networks, index distribution, lock free, wait
	free},
  url = {http://portal.acm.org/citation.cfm?id=235543.235546&coll=GUIDE&dl=GUIDE&CFID=49561130&CFTOKEN=28654349}
}

@TECHREPORT{singh_splash:_1991,
  author = {Singh, Jaswinder P and Weber, Wolf and Gupta, Anoop},
  title = {{SPLASH:} Stanford parallel applications for shared-memory},
  institution = {Stanford University},
  year = {1991},
  abstract = {This report was replaced and updated in {CSL-TR-92-526}},
  shorttitle = {{SPLASH}},
  url = {http://portal.acm.org/citation.cfm?id=891800}
}

@INPROCEEDINGS{spear_privatization_2007,
  author = {Spear, Michael F. and Marathe, Virendra J. and Dalessandro, Luke
	and Scott, Michael L.},
  title = {Privatization techniques for software transactional memory},
  booktitle = {Proceedings of the twenty-sixth annual {ACM} symposium on Principles
	of distributed computing},
  year = {2007},
  pages = {338--339},
  address = {Portland, Oregon, {USA}},
  publisher = {{ACM}},
  abstract = {Note: {OCR} errors may be found in this Reference List extracted from
	the full text article. {ACM} has opted to expose the complete List
	rather than only correct and linked references.},
  doi = {10.1145/1281100.1281161},
  isbn = {978-1-59593-616-5},
  keywords = {privatization, software transactional memory, synchronization},
  url = {http://portal.acm.org/citation.cfm?id=1281161}
}

@ARTICLE{steffan_potential_1998,
  author = {Steffan, J. Gregory and Mowry, Todd C},
  title = {The Potential for Using {Thread-Level} Data Speculation to Facilitate
	Automatic Parallelization},
  year = {1998},
  pages = {2---13},
  keywords = {ok},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.33.9954}
}

@ARTICLE{stone_opencl:_2010,
  author = {Stone, John E. and Gohara, David and Shi, Guochun},
  title = {{OpenCL:} A Parallel Programming Standard for Heterogeneous Computing
	Systems},
  journal = {{IEEE} Des. Test},
  year = {2010},
  volume = {12},
  pages = {66--73},
  number = {3},
  abstract = {The {OpenCL} standard offers a common {API} for program execution
	on systems composed of different types of computational devices such
	as multicore {CPUs}, {GPUs}, or other accelerators.},
  shorttitle = {{OpenCL}},
  url = {http://portal.acm.org/citation.cfm?id=622179.1803953&coll=GUIDE&dl=GUIDE&CFID=105156593&CFTOKEN=37140497}
}

@INPROCEEDINGS{thies_unified_2001,
  author = {Thies, William and Vivien, Fr\'{e}d\'{e}ric and Sheldon, Jeffrey
	and Amarasinghe, Saman},
  title = {A unified framework for schedule and storage optimization},
  year = {2001},
  booktitle = 	{Proc.\ of the 2001 {PLDI} Conf.},
}

@INPROCEEDINGS{tournavitis_towards_2009,
  author = {Tournavitis, Georgios and Wang, Zheng and Franke, Bj\"{o}rn and {O'Boyle},
	Michael {F.P.}},
  title = {Towards a holistic approach to auto-parallelization: integrating
	profile-driven parallelism detection and machine-learning based mapping},
  booktitle = {Proceedings of the 2009 {ACM} {SIGPLAN} conference on Programming
	language design and implementation},
  year = {2009},
  pages = {177--187},
  address = {Dublin, Ireland},
  publisher = {{ACM}},
  abstract = {Compiler-based auto-parallelization is a much studied area, yet has
	still not found wide-spread application. This is largely due to the
	poor exploitation of application parallelism, subsequently resulting
	in performance levels far below those which a skilled expert programmer
	could achieve. We have identified two weaknesses in traditional parallelizing
	compilers and propose a novel, integrated approach, resulting in
	significant performance improvements of the generated parallel code.
	Using profile-driven parallelism detection we overcome the limitations
	of static analysis, enabling us to identify more application parallelism
	and only rely on the user for final approval. In addition, we replace
	the traditional target-specific and inflexible mapping heuristics
	with a machine-learning based prediction mechanism, resulting in
	better mapping decisions while providing more scope for adaptation
	to different target architectures. We have evaluated our parallelization
	strategy against the {NAS} and {SPEC} {OMP} benchmarks and two different
	multi-core platforms (dual quad-core Intel Xeon {SMP} and dual-socket
	{QS20} Cell blade). We demonstrate that our approach not only yields
	significant improvements when compared with state-of-the-art parallelizing
	compilers, but comes close to and sometimes exceeds the performance
	of manually parallelized codes. On average, our methodology achieves
	96\% of the performance of the hand-tuned {OpenMP} {NAS} and {SPEC}
	parallel benchmarks on the Intel Xeon platform and gains a significant
	speedup for the {IBM} Cell platform, demonstrating the potential
	of profile-guided and machine-learning based parallelization for
	complex multi-core platforms.},
  doi = {10.1145/1542476.1542496},
  isbn = {978-1-60558-392-1},
  keywords = {auto-parallelization, machine-learning based parallelism mapping,
	openmp, profile-driven parallelism detection},
  shorttitle = {Towards a holistic approach to auto-parallelization},
  url = {http://portal.acm.org/citation.cfm?id=1542476.1542496}
}

@MISC{trifunovic_graphite_2010,
  author = {Trifunovic, Konrad and Cohen, Albert and Edelsohn, David and Li,
	Feng and Grosser, Tobias and Jagasia, Harsha and Ladelsky, Razya
	and Pop, Sebastian and Sjodin, Jan and Upadrasta, Ramakrishna},
  title = {{GRAPHITE} Two Years After: First Lessons Learned From {Real-World}
	Polyhedral Compilation},
  month = jan,
  year = {2010},
  shorttitle = {{GRAPHITE} Two Years After}
}

@INCOLLECTION{tu_automatic_2001,
  author = {Tu, Peng and Padua, David},
  title = {Automatic Array Privatization},
  booktitle = {Compiler Optimizations for Scalable Parallel Systems},
  year = {2001},
  pages = {247--281},
  abstract = {This chapter discusses techniques for automatic array privatization
	developed as part of the Polaris project at the University of Illinois
	at {Urbana-Champaign.} Array privatization is one of the most important
	transformations for effective program parallelization.
	
	The array privatization problem is formulatedas data flow equations
	involving sets of scalars andarra y regions. A single-pass algorithm
	to solve these data flow equations is introduced. Finally, this chapter
	presents a demand-driven symbolic analysis algorithm to manipulate
	array regions whose bounds are represented by symbolic expressions.},
  url = {http://dx.doi.org/10.1007/3-540-45403-9_8}
}

@PHDTHESIS{vasilache_scalable_2007,
  author = {Vasilache, Nicolas},
  title = {Scalable Program Optimization Techniques in the Polyhedral Model},
  school = {University of {Paris-Sud}},
  year = {2007},
  address = {{INRIA}},
  month = sep
}

@Article{Gir06,
  author = 	 {Sylvain Girbal and Nicolas Vasilache and C\'edric Bastoul
                  and Albert Cohen and David Parello
                  and Marc Sigler and Olivier Temam},
  title = 	 {Semi-Automatic Composition of Loop Transformations
                  for Deep Parallelism and Memory Hierarchies},
  journal = 	 {Int. J. on Parallel Programming},
  publisher = 	 {Kluwer},
  year =	 2006,
  month =        jun,
  volume =       34,
  number =       3,
  pages =        {261--317},
  note =         {Special issue on Microgrids.}
}

@INPROCEEDINGS{vasilache_violated_2006,
  author = {Vasilache, Nicolas and Bastoul, Cedric and Cohen, Albert and Girbal,
	Sylvain},
  title = {Violated dependence analysis},
  booktitle = {Proceedings of the 20th annual international conference on Supercomputing},
  year = {2006},
  pages = {335--344},
  address = {Cairns, Queensland, Australia},
  publisher = {{ACM}},
  abstract = {The polyhedral model is a powerful framework to reason about high
	level loop transformations. Yet the lack of scalable algorithms and
	tools has deterred actors from both academia and industry to put
	this model to practical use. Indeed, for fundamental complexity reasons,
	its applicability has long been limited to simple kernels. Recent
	developments broke some generally accepted ideas about these limitations.
	In particular, new algorithms made it possible to compute the target
	code for full {SPEC} benchmarks while this code generation step was
	expected not to be {scalable.Instancewise} array dependence analysis
	computes a finite, intensional representation of the (statically
	unbounded) set of all dynamic dependences. This problem has always
	been considered non-scalable and/or an overkill with respect to less
	expressive and faster dependence tests. On the contrary, this article
	presents experimental evidence of its applicability to full {SPEC}
	{CPU2000} benchmarks. To make this possible, we revisit the characterization
	of data dependences, considering relations between time dimensions
	of the transformed space. Beyond algorithmic benefits, this naturally
	leads to a novel way of reasoning about violated dependences across
	arbitrary transformation sequences. Reasoning about violated dependences
	relieves the compiler designer from the cumbersome task of implementing
	specific legality checks for each single transformation. It also
	allows, in the case of invalid transformations, to precisely determine
	the violated dependences that need to be corrected. Identifying these
	violations can in turn enable automatic correction schemes to fix
	an illegal transformation sequence with minimal changes.},
  doi = {10.1145/1183401.1183448},
  isbn = {1-59593-282-8},
  url = {http://portal.acm.org/citation.cfm?id=1183401.1183448}
}

@INCOLLECTION{verdoolaege_isl:_2010,
  author = {Verdoolaege, Sven},
  title = {isl: An Integer Set Library for the Polyhedral Model},
  booktitle = {Mathematical Software - {ICMS} 2010},
  publisher = {Springer Berlin Heidelberg},
  year = {2010},
  editor = {Hoeven, Joris van der and Joswig, Michael and Takayama, Nobuki and
	Fukuda, Komei},
  volume = {6327},
  pages = {299--302},
  address = {Berlin, Heidelberg},
  isbn = {978-3-642-15581-9},
  shorttitle = {isl},
  url = {http://www.springerlink.com/content/p4968q65x854510t/}
}

@INPROCEEDINGS{wang_code_2007,
  author = {Wang, Cheng and Chen, {Wei-Yu} and Wu, Youfeng and Saha, Bratin and
	{Adl-Tabatabai}, {Ali-Reza}},
  title = {Code Generation and Optimization for Transactional Memory Constructs
	in an Unmanaged Language},
  booktitle = {Code Generation and Optimization, 2007. {CGO} '07. International
	Symposium on},
  year = {2007},
  pages = {34--48},
  abstract = {Transactional memory offers significant advantages for concurrency
	control compared to locks. This paper presents the design and implementation
	of transactional memory constructs in an unmanaged language. Unmanaged
	languages pose a unique set of challenges to transactional memory
	constructs - for example, lack of type and memory safety, use of
	function pointers, aliasing of local variables, and others. This
	paper describes novel compiler and runtime mechanisms that address
	these challenges and optimize the performance of transactions in
	an unmanaged environment. We have implemented these mechanisms in
	a production-quality C compiler and a high-performance software transactional
	memory runtime. We measure the effectiveness of these optimizations
	and compare the performance of lock-based versus transaction-based
	programming on a set of concurrent data structures and the {SPLASH-2}
	benchmark suite. On a 16 processor {SMP} system, the transaction-based
	version of the {SPLASH-2} benchmarks scales much better than the
	coarse-grain locking version and performs comparably to the fine-grain
	locking version. Compiler optimizations significantly reduce the
	overheads of transactional memory so that, on a single thread, the
	transaction-based version incurs only about 6.4\% overhead compared
	to the lock-based version for the {SPLASH-2} benchmark suite. Thus,
	our system is the first to demonstrate that transactions integrate
	well with an unmanaged language, and can perform as well as fine-grain
	locking while providing the programming ease of coarse-grain locking
	even on an unmanaged environment},
  doi = {10.1109/CGO.2007.4},
  keywords = {coarse-grain locking, code generation, compiler optimizations, concurrency
	control, concurrent data structures, data structures, ok, optimising
	compilers, production-quality C compiler, resumed, software transactional
	memory runtime, {SPLASH-2} benchmark suite, storage management, transactional
	memory construct optimization, transaction-based programming, unmanaged
	language}
}

@INPROCEEDINGS{wong_parallel_2009,
  author = {Wong, Man Leung},
  title = {Parallel multi-objective evolutionary algorithms on graphics processing
	units},
  booktitle = {Proceedings of the 11th Annual Conference Companion on Genetic and
	Evolutionary Computation Conference: Late Breaking Papers},
  year = {2009},
  pages = {2515--2522},
  address = {Montreal, Qu\'{e}bec, Canada},
  publisher = {{ACM}},
  abstract = {Most real-life optimization problems or decision-making problems are
	multi-objective in nature, since they normally have several (possibly
	conflicting) objectives that must be satisfied at the same time.
	{Multi-Objective} Evolutionary Algorithms {(MOEAs)} have been gaining
	increasing attention among researchers and practitioners. However,
	they may execute for a long time for some difficult problems, because
	several evaluations must be performed. Moreover, the non-dominance
	checking and the non-dominated selection procedures are also very
	time consuming. From our experiments, more than 99\% of the execution
	time is used in performing the two procedures. A promising approach
	to overcome this limitation is to parallelize these algorithms. In
	this paper, we propose a parallel {MOEA} on consumer-level Graphics
	Processing Units {(GPU).} We perform many experiments on two-objective
	and three-objective benchmark problems to compare our parallel {MOEA}
	with a sequential {MOEA} and demonstrate that the former is much
	more efficient than the latter.},
  doi = {10.1145/1570256.1570354},
  isbn = {978-1-60558-505-5},
  keywords = {graphic processing units, multi-objective evolutionary algorithms,
	parallel programming},
  url = {http://portal.acm.org/citation.cfm?id=1570256.1570354&coll=GUIDE&dl=GUIDE&CFID=105156593&CFTOKEN=37140497}
}

@ARTICLE{wu_compiler_2009,
  author = {Wu, Peng and Michael, Maged M. and Praun, Christoph von and Nakaike,
	Takuya and Bordawekar, Rajesh and Cain, Harold W. and Cascaval, Calin
	and Chatterjee, Siddhartha and Chiras, Stefanie and Hou, Rui and
	Mergen, Mark and Shen, Xiaowei and Spear, Michael F. and Wang, Hua
	Yong and Wang, Kun},
  title = {Compiler and runtime techniques for software transactional memory
	optimization},
  journal = {Concurr. Comput. : Pract. Exper.},
  year = {2009},
  volume = {21},
  pages = {7--23},
  number = {1},
  abstract = {Software transactional memory {(STM)} systems are an attractive environment
	to evaluate optimistic concurrency. We describe our experience of
	supporting and optimizing an {STM} system at both the managed runtime
	and compiler levels. We describe the design policies of our {STM}
	system and the statistics collected by the runtime to identify performance
	bottlenecks and guide tuning decisions. We present an initial work
	on supporting automatic instrumentation of the {STM} primitives for
	{C-C++} and Java programs in the {IBM} {XL} compiler and J9 Java
	virtual machine. We evaluate and discuss the performance of several
	transactional programs running on our system. Copyright {\textcopyright}
	2008 John Wiley \& Sons, Ltd.},
  keywords = {compiler optimization, optimistic concurrency, software transactional
	memory, transactional memory},
  url = {http://portal.acm.org/citation.cfm?id=1464467}
}

@INPROCEEDINGS{yang_speculative_2005,
  author = {Yang, Chen and Lim, {Chu-Cheow}},
  title = {Speculative Parallel Threading Architecture and Compilation},
  booktitle = {Proceedings of the 2005 International Conference on Parallel Processing
	Workshops},
  year = {2005},
  pages = {285--294},
  publisher = {{IEEE} Computer Society},
  abstract = {Thread-level speculation is a technique that brings thread-level parallelism
	beyond the data-flow limit by executing a piece of code ahead of
	time speculatively before all its input data are ready. This technique
	appears particularly appealing for speeding uphard-to-parallelize
	applications. Although various thread-level speculation architectures
	and compilation techniques have been proposed by the research community,
	scalar applications remain difficult to be parallelized. It has not
	yet shown how well these applications can actually be benefited from
	thread-level speculation and if the performance gain is significant
	enough to justify the required hardware support. In an attempt to
	understand and realize the potential gain with thread-level speculation
	especially for scalar applications, we proposed an {SPT} {(Speculative}
	Parallel Threading) architecture and developed an {SPT} compiler
	to generate optimal speculatively parallelized code. Our evaluation
	showed that with our {SPT} approach 10 {SPECint2000} programs can
	achieve an average of 15.6\% speedup on a two-core {SPT} processor
	by exploiting only loop parallelism. This paper describes the {SPT}
	architecture and the {SPT} compiler which performs aggressive cost-driven
	loop selection and transformation, and presents our performance evaluation
	results.},
  isbn = {0-7695-2381-1},
  url = {http://portal.acm.org/citation.cfm?id=1078033.1079480&coll=GUIDE&dl=GUIDE&CFID=81819018&CFTOKEN=62199148}
}

@ARTICLE{richard_m._yoo_helper_????,
  author = {Richard M. Yoo and {HsienHsin} S. Lee},
  title = {Helper Transactions, Enabling {ThreadLevel} Speculation via A Transactional
	Memory System}
}

@INPROCEEDINGS{yoo_kicking_2008,
  author = {Yoo, Richard M. and Ni, Yang and Welc, Adam and Saha, Bratin and
	{Adl-Tabatabai}, {Ali-Reza} and Lee, {Hsien-Hsin} S.},
  title = {Kicking the tires of software transactional memory: why the going
	gets tough},
  booktitle = {Proceedings of the twentieth annual symposium on Parallelism in algorithms
	and architectures},
  year = {2008},
  pages = {265--274},
  address = {Munich, Germany},
  publisher = {{ACM}},
  abstract = {Transactional Memory {(TM)} promises to simplify concurrent programming,
	which has been notoriously difficult but crucial in realizing the
	performance benefit of multi-core processors. Software Transaction
	Memory {(STM)}, in particular, represents a body of important {TM}
	technologies since it provides a mechanism to run transactional programs
	when hardware {TM} support is not available, or when hardware {TM}
	resources are exhausted. Nonetheless, most previous researches on
	{STMs} were constrained to executing trivial, small-scale workloads.
	The assumption was that the same techniques applied to small-scale
	workloads could readily be applied to real-life, large-scale workloads.
	However, by executing several nontrivial workloads such as particle
	dynamics simulation and game physics engine on a state of the art
	{STM}, we noticed that this assumption does not hold. Specifically,
	we identified four major performance bottlenecks that were unique
	to the case of executing large-scale workloads on an {STM:} false
	conflicts, over-instrumentation, privatization-safety cost, and poor
	amortization. We believe that these bottlenecks would be common for
	any {STM} targeting real-world applications. In this paper, we describe
	those identified bottlenecks in detail, and we propose novel solutions
	to alleviate the issues. We also thoroughly validate these approaches
	with experimental results on real machines.},
  doi = {10.1145/1378533.1378582},
  isbn = {978-1-59593-973-9},
  keywords = {c/c++, compiler, measurement, performance, read if time, runtime,
	software transactional memory},
  shorttitle = {Kicking the tires of software transactional memory},
  url = {http://portal.acm.org/citation.cfm?id=1378533.1378582}
}

@ARTICLE{yu_techniques_2000,
  author = {Yu, H. and Rauchwerger, L.},
  title = {Techniques for Reducing the Overhead of {Run-Time} Parallelization},
  journal = {Lecture Notes in Computer Science},
  year = {2000},
  pages = {232{\textendash}248},
  keywords = {p++, read yes}
}

@INPROCEEDINGS{zhuang_exploiting_2009,
  author = {Zhuang, Xiaotong and Eichenberger, Alexandre E. and Luo, Yangchun
	and {O'Brien}, Kevin and {O'Brien}, Kathryn},
  title = {Exploiting Parallelism with {Dependence-Aware} Scheduling},
  booktitle = {Parallel Architectures and Compilation Techniques, International
	Conference on},
  year = {2009},
  pages = {193--202},
  address = {Los Alamitos, {CA}, {USA}},
  publisher = {{IEEE} Computer Society},
  abstract = {It is well known that a large fraction of applications cannot be parallelized
	at compile time due to unpredictable data dependences such as indirect
	memory accesses and/or memory accesses guarded by data-dependent
	conditional statements. A significant body of prior work attempts
	to parallelize such applications using runtime data-dependence analysis
	and scheduling. Performance is highly dependent on the ratio of the
	dependence analysis overheads with respect to the actual amount of
	parallelism available in the code. We have found that the overheads
	are often high and the available parallelism is often low when evaluating
	applications on a modern multicore processor. We propose a novel
	software-based approach called dependence-aware scheduling to parallelize
	loops with unknown data dependences. Unlike prior work, our main
	goal is to reduce the negative impact of dependence computation,
	such that when there is not an opportunity of getting speedup, the
	code can still run without much slowdown. If there is an opportunity,
	dependence-aware scheduling is able to yield very impressive speedup.
	Our results indicate that dependence-aware scheduling can greatly
	improve performance, with up to 4x speedups, for a number of computation
	intensive applications. Furthermore, the results also show negligible
	slowdowns in a stress test, where parallelism is continuously detected
	but not exploited.},
  doi = {http://doi.ieeecomputersociety.org/10.1109/PACT.2009.10},
  keywords = {inspector/executor, multicore, partial parallelism, runtime dependence
	analysis, thread scheduling}
}

@BOOK{zima_supercompilers_1991,
  title = {Supercompilers for parallel and vector computers},
  publisher = {{ACM}},
  year = {1991},
  author = {Zima, Hans and Chapman, Barbara},
  isbn = {0-201-17560-6},
  url = {http://portal.acm.org/citation.cfm?id=89627}
}

@Article{Fea92,
  author = 	 {P. Feautrier},
  title = 	 {Some Efficient Solutions to the Affine Scheduling
                  Problem, Part {II}, multidimensional time},
  journal = 	 {Int. J. on Parallel Programming},
  year = 	 1992,
  volume =	 21,
  number =	 6,
  pages =	 {389--420},
  month =	 dec,
  note =	 {See also Part {I}, one dimensional time, 21(5):315--348}
}

@Article{Bar97,
  author =       {D. Barthou and J.-F. Collard and P. Feautrier},
  title =        {Fuzzy Array Dataflow Analysis},
  journal =      {J. on Parallel and Distributed Computing},
  year =         1997,
  volume =       40,
  pages =        {210--226}
}

@InProceedings{Ben10,
  author =       {Mohamed-Walid Benabderrahmane and Louis-No\"el Pouchet and
                  Albert Cohen and C\'edric Bastoul},
  title =        {The Polyhedral Model Is More Widely Applicable
                  Than You Think},
  booktitle =    {Proceedings of the International Conference on
                  Compiler Construction (ETAPS CC'10)},
  year =         2010,
  series =       {LNCS},
  number =       6011,
  address =      {Paphos, Cyprus},
  month =        mar,
  publisher =    {Springer-Verlag}
}

@Article{Qui00,
  author = 	 {F. Quiller\'e and S. Rajopadhye},
  title = 	 {Optimizing Memory Usage in the Polyhedral Model},
  journal = 	 {ACM Trans. on Programming Languages and Systems},
  year = 	 2000,
  volume =	 22,
  number =	 5,
  pages =	 {773--815},
  month =	 sep
}

@inproceedings{Ren07,
  author = 	{Lakshminarayanan Renganarayanan and DaeGon Kim and Sanjay Rajopadhye and Michelle Mills Strout},
  title = 	{Parameterized tiled loops for free},
  booktitle = 	{Proc.\ of the 2007 {PLDI} Conf.},
  year = 	{2007}
}

@InProceedings{Bas04,
  author = 	 {C. Bastoul},
  title = 	 {Code generation in the polyhedral model is easier than you think},
  booktitle = 	 {IEEE Intl.\ Conf.\ on Parallel
                  Architectures and Compilation Techniques (PACT'04)},
  year =	 2004,
  pages =	 {7--16},
  month =	 sep,
  address =	 {Juan-les-Pins, France}
}

@Book{Sch86,
  author = 	 {A. Schrijver},
  title = 	 {Theory of linear and integer programming},
  publisher = 	 {John Wiley \& Sons},
  year = 	 1986
}

@INPROCEEDINGS{zyulkyarov_atomic_2009,
  author = {Zyulkyarov, Ferad and Gajinov, Vladimir and Unsal, Osman S. and Cristal,
	Adri\'{a}n and Ayguad\'{e}, Eduard and Harris, Tim and Valero, Mateo},
  title = {Atomic quake: using transactional memory in an interactive multiplayer
	game server},
  booktitle = {Proceedings of the 14th {ACM} {SIGPLAN} symposium on Principles and
	practice of parallel programming},
  year = {2009},
  pages = {25--34},
  address = {Raleigh, {NC}, {USA}},
  publisher = {{ACM}},
  abstract = {Transactional Memory {(TM)} is being studied widely as a new technique
	for synchronizing concurrent accesses to shared memory data structures
	for use in multi-core systems. Much of the initial work on {TM} has
	been evaluated using microbenchmarks and application kernels; it
	is not clear whether conclusions drawn from these workloads will
	apply to larger systems. In this work we make the first attempt to
	develop a large, complex, application that uses {TM} for all of its
	synchronization. We describe how we have taken an existing parallel
	implementation of the Quake game server and restructured it to use
	transactions. In doing so we have encountered examples where transactions
	simplify the structure of the program. We have also encountered cases
	where using transactions occludes the structure of the existing code.
	Compared with existing {TM} benchmarks, our workload exhibits non-block-structured
	transactions within which there are {I/Ooperations} and system call
	invocations. There are long and short running transactions {(200-1.3M}
	cycles) with small and large read and write sets (a few bytes to
	{1.5MB).} There are nested transactions reaching up to 9 levels at
	runtime. There are examples where error handling and recovery occurs
	inside transactions. There are also examples where data changes between
	being accessed transactionally and accessed non-transactionally.
	However, we did not see examples where the kind of access to one
	piece of data depended on the value of another.},
  doi = {10.1145/1504176.1504183},
  isbn = {978-1-60558-397-6},
  keywords = {benchmark, quake, transactional memory, workload},
  shorttitle = {Atomic quake},
  url = {http://portal.acm.org/citation.cfm?id=1504176.1504183}
}

@MISC{_1.184.potential_????,
  title = {{1.184.The} Potential for Using {Thread-Level} Data Speculation to
	Facilitate Automatic Parallelization (1998) .pdf}
}

@MISC{_1.93.automatic_????,
  title = {{1.93.Automatic} Program Parallelization (1993) .pdf}
}

@MISC{_8.softspec_????,
  title = {{8.Softspec}, Software-based Speculative Parallelism (2000).pdf}
}

@MISC{_atomic_????,
  title = {Atomic blocks Compiler optimization and techniques- {TRAMP2007-Sarkar.pdf}}
}

@MISC{_automatic_????,
  title = {Automatic Detection of Parallelism, A Grand Challenge for {High-Performance}
	Computing (1994).pdf}
}

@ARTICLE{_collective_????,
  title = {Collective Tuning Initiative, automating and accelerating development
	and optimization of computing systems}
}

@MISC{_compiler_????,
  title = {Compiler and Runtime Techniques for Software Transactional Memory
	Optimization -cpe08.pdf}
}

@MISC{_compiler_????-1,
  title = {Compiler techniques for concurrent multithreading with hardware speculation
	support (1996).pdf}
}

@MISC{_does_????,
  title = {Does Transactional Memory Keep Its Promises? Results from an Empirical
	Study.}
}

@MISC{_dynamic_????,
  title = {Dynamic Parallel Media Processing Using Speculative Broadcast Loop
	{(SBL)} {(Extended} Abstract) (2001).pdf}
}

@MISC{_efficient_????,
  title = {Efficient Interprocedural Array Data-flow Analysis for Automatic
	Program Parallelization (2000).pdf}
}

@MISC{_exposing_????,
  title = {Exposing Speculative Thread Parallelism in {SPEC2000-cavazos}, {hydra\_PPoPP05.pdf}}
}

@MISC{_genetic_????,
  title = {Genetic Programming on General Purpose Graphics Processing Units
	: gpgpgpu.com},
  howpublished = {http://www.gpgpgpu.com/},
  url = {http://www.gpgpgpu.com/}
}

@MISC{_genetic_????-1,
  title = {Genetic Programming on the G80 {GPU}},
  howpublished = {{http://www-lil.univ-littoral.fr/{\textasciitilde}robillia/GPUregression.html}},
  url = {http://www-lil.univ-littoral.fr/~robillia/GPUregression.html}
}

@MISC{_gpuregression:_????,
  title = {{GPURegression:} Population Parallel {GP} on G80 {GPUs}},
  howpublished = {{http://www.gpgpgpu.com/GPUregression.html}},
  url = {http://www.gpgpgpu.com/GPUregression.html}
}

@MISC{_hybrid_????,
  title = {Hybrid Dependence Analysis for Automatic Parallelization},
  url = {http://parasol.tamu.edu/publications/abstract.php?pub_id=366}
}

@MISC{_implementation_????,
  title = {Implementation of Speculative Parallelism in Functional Languages-94.pdf}
}

@MISC{_implementation_????-1,
  title = {Implementation Issues of Loop-level Speculative Run-time Parallelization
	(1999).pdf}
}

@MISC{_intel_????,
  title = {Intel architecture optimization guide.pdf},
  url = {http://www.intel.com/Assets/PDF/manual/248966.pdf}
}

@MISC{_loop-level_????,
  title = {Loop-level Speculative Parallelism in Embedded {Applications-ICPP2007-Final.pd}}
}

@ARTICLE{_milepost_????,
  title = {{MILEPOST} {GCC}, machine learning based research compiler}
}

@MISC{_new_????,
  title = {A New Scheduling Strategy for Speculative Parallelization of Randomized
	Incremental Algorithms (2005).pdf},
  keywords = {read ?}
}

@MISC{_new_????-1,
  title = {The new framework for loop nest optimization in {GCC:} from prototyping
	to {evaluation-Cohen-PCJS06.ps.gz}}
}

@MISC{_parallelization_????,
  title = {Parallelization of {Branch-and-Bound} Algorithms in a Functional
	Programming Environment (1992).pdf}
}

@MISC{_parallelizing_????,
  title = {Parallelizing While Loops for Multiprocessor Systems (1995).pdf}
}

@MISC{_poster_speculative_parallel_threading.pdf_????,
  title = {{poster\_Speculative\_Parallel\_Threading.pdf}}
}

@MISC{_principles_????,
  title = {Principles of Speculative Run-time Parallelization (1998).pdf}
}

@ARTICLE{_rose_????,
  title = {{ROSE} {II:} An Optimizing Code Transformer for C {Object-Oriented}
	Array Class Libraries},
  shorttitle = {{ROSE} {II}},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.76.5161}
}

@MISC{_search_????,
  title = {In search of speculative thread level parallelism.oplinger99.ps}
}

@MISC{_speculative_????,
  title = {Speculative Parallel Execution of Loops with {Cross-Iteration} Dependences
	in {DSM} Multiprocessors .pdf}
}

@MISC{_speculative_????-1,
  title = {Speculative Parallelization of Partially Parallel Loops (2000).pdf}
}

@ARTICLE{_stampstanford_????,
  title = {{STAMP,Stanford} Transactional Applications for {Multi-Processing-tcc\_iiswc2008}}
}

@MISC{_supporting_????,
  title = {Supporting Array Dependence Testing for an Optimizing Parallelizing
	C Compiler (1993) .pdf}
}

@MISC{_survey_????,
  title = {A Survey of Speculative Parallelism.pdf},
  keywords = {ok}
}

@MISC{_techniques_????,
  title = {Techniques for Speculative {Run-Time} Parallelization of Loops (1998)
	.pdf}
}

@MISC{_techniques_????-1,
  title = {techniques for speculative of run time parallelisation of loops-sc98.ps}
}

@MISC{_thread-level_????,
  title = {Thread-level Speculative Parallelization(beyond 2005).pdf}
}

@MISC{_uncovering_????,
  title = {Uncovering Hidden Loop Level Parallelism in Sequential Applications-hongtaoz-hpca08.pdf}
}

